Hello, reader! Thank you for your interest in this work.

Let me guide you through the steps taken to conduct the statistical analyses reported on our paper.

Preparation

Loading libraries

Here are the libraries that we are going to employ. If you have not already, remember to install them via install.packages() first.

library(tidyverse)
library(cowplot)
library(randomForest)
library(caret)
library(networkD3)
library(gridExtra)
library(RColorBrewer)
library(extrafont)
library(corrplot)
library(sjmisc)
library(sjlabelled)
library(ggridges)
library(reshape2)

Loading data sets

Next, we are going to load the three data sets obtained after computing the measures needed for our model with Python: one .CSV file for the graded readers corpus, another one for the literary works corpus, and the last one for the reference corpus of general Spanish. Furthermore, we will also load some stats computed by graded lexical item.

# === GRADED READERS ===

# Corpus
readers_corpus <- read_csv('Data/readers.csv')

# Vocabulary stats
readers_vocab_beginner <- read_csv('Data/Vocabulary/beginner.csv')
readers_vocab_intermediate <- read_csv('Data/Vocabulary/intermediate.csv')
readers_vocab_advanced <- read_csv('Data/Vocabulary/advanced.csv')


# === LITERARY WORKS ===

# Corpus
literature_corpus <- read_csv('Data/literature.csv')

# Vocabulary stats
literature_vocab_children <- read_csv('Data/Vocabulary/children.csv')
literature_vocab_youth <- read_csv('Data/Vocabulary/youth.csv')
literature_vocab_adult <- read_csv('Data/Vocabulary/adult.csv')


# === REFERENCE CORPUS ===

# Corpus
reference_corpus <- read_csv('Data/reference.csv')

# Vocabulary stats
reference_vocab <- read_csv('Data/Vocabulary/reference.csv')

Getting the data sets nice and ready

The original data sets have way too many columns. Now we are going to select only the ones relevant to the statistical analyses. We are also going to add up the column values of auxiliary and regular verb features.

# ================================ GRADED READERS DATA SET ================================

# STEP 1. Select relevant columns
selected_columns <- c("Freq A1", "Freq A2", "Freq B1", "Freq B2",
                      "TFIDF A1", "TFIDF A2", "TFIDF B1", "TFIDF B2",
                      "Tree", "Context freq", "Context TFIDF", "Context tree",
                      "deprel-mark DOC%", "deprel-advcl DOC%", "deprel-xcomp DOC%",
                      "deprel-ccomp DOC%", "deprel-acl DOC%", "deprel-nmod DOC%",
                      "deprel-conj DOC%", "deprel-appos DOC%",
                      "upos-AUX-Tense-Pres DOC%", "upos-AUX-Tense-Past DOC%",
                      "upos-AUX-Tense-Fut DOC%", "upos-AUX-Tense-Imp DOC%",
                      "upos-AUX-Mood-Ind DOC%", "upos-AUX-Mood-Sub DOC%",
                      "upos-AUX-Mood-Cnd DOC%", "upos-VERB-Tense-Pres DOC%",
                      "upos-VERB-Tense-Past DOC%", "upos-VERB-Tense-Fut DOC%",
                      "upos-VERB-Tense-Imp DOC%", "upos-VERB-Mood-Ind DOC%",
                      "upos-VERB-Mood-Sub DOC%", "upos-VERB-Mood-Cnd DOC%",
                      "Lexical density", "msttr", "mtld", "hdd", "Herdan")

readers_corpus <- readers_corpus %>% dplyr::select("Title", "Level",
                                                   all_of(selected_columns))

# STEP 2.1 Add up A1 and A2 vocabulary
readers_corpus <- readers_corpus %>%
  mutate(`Freq A1-A2` = `Freq A1` + `Freq A2`) %>%
  select(1:(match("Freq A1", names(.)) - 1),  # Columns before "Freq A1"
         `Freq A1-A2`,                         # New column
         (match("Freq A2", names(.)) + 1):ncol(.))  # Columns after "Freq A2"

readers_corpus <- readers_corpus %>%
  mutate(`TFIDF A1-A2` = `TFIDF A1` + `TFIDF A2`) %>%
  select(1:(match("TFIDF A1", names(.)) - 1),  # Columns before "Freq A1"
         `TFIDF A1-A2`,                         # New column
         (match("TFIDF A2", names(.)) + 1):ncol(.))  # Columns after "Freq A2"


# STEP 2.2 Add up auxiliary and regular verb features in a single column
readers_corpus$`upos-VERB-Mood-Ind DOC%` <-
  readers_corpus$`upos-AUX-Mood-Ind DOC%` + readers_corpus$`upos-VERB-Mood-Ind DOC%`
readers_corpus$`upos-VERB-Mood-Sub DOC%` <-
  readers_corpus$`upos-AUX-Mood-Sub DOC%` + readers_corpus$`upos-VERB-Mood-Sub DOC%`
readers_corpus$`upos-VERB-Mood-Cnd DOC%` <-
  readers_corpus$`upos-AUX-Mood-Cnd DOC%` + readers_corpus$`upos-VERB-Mood-Cnd DOC%`
readers_corpus$`upos-VERB-Tense-Pres DOC%` <-
  readers_corpus$`upos-AUX-Tense-Pres DOC%` + readers_corpus$`upos-VERB-Tense-Pres DOC%`
readers_corpus$`upos-VERB-Tense-Imp DOC%` <-
  readers_corpus$`upos-AUX-Tense-Imp DOC%` + readers_corpus$`upos-VERB-Tense-Imp DOC%`
readers_corpus$`upos-VERB-Tense-Past DOC%` <-
  readers_corpus$`upos-AUX-Tense-Past DOC%` + readers_corpus$`upos-VERB-Tense-Past DOC%`
readers_corpus$`upos-VERB-Tense-Fut DOC%` <-
  readers_corpus$`upos-AUX-Tense-Fut DOC%` + readers_corpus$`upos-VERB-Tense-Fut DOC%`

# STEP 3. Drop auxiliary verb feature columns
readers_corpus <- dplyr::select(readers_corpus, -c(21:27))

# ================================== LITERATURE DATA SET ==================================

# STEP 1. Select relevant columns
literature_corpus <- literature_corpus %>% dplyr::select("Title", "Level",
                                                         all_of(selected_columns))

# STEP 2.1 Add up A1 and A2 vocabulary
literature_corpus <- literature_corpus %>%
  mutate(`Freq A1-A2` = `Freq A1` + `Freq A2`) %>%
  select(1:(match("Freq A1", names(.)) - 1),  # Columns before "Freq A1"
         `Freq A1-A2`,                         # New column
         (match("Freq A2", names(.)) + 1):ncol(.))  # Columns after "Freq A2"

literature_corpus <- literature_corpus %>%
  mutate(`TFIDF A1-A2` = `TFIDF A1` + `TFIDF A2`) %>%
  select(1:(match("TFIDF A1", names(.)) - 1),  # Columns before "Freq A1"
         `TFIDF A1-A2`,                         # New column
         (match("TFIDF A2", names(.)) + 1):ncol(.))  # Columns after "Freq A2"


# STEP 2.2 Add up auxiliary and regular verb features in a single column
literature_corpus$`upos-VERB-Mood-Ind DOC%` <-
  literature_corpus$`upos-AUX-Mood-Ind DOC%` + literature_corpus$`upos-VERB-Mood-Ind DOC%`
literature_corpus$`upos-VERB-Mood-Sub DOC%` <-
  literature_corpus$`upos-AUX-Mood-Sub DOC%` + literature_corpus$`upos-VERB-Mood-Sub DOC%`
literature_corpus$`upos-VERB-Mood-Cnd DOC%` <-
  literature_corpus$`upos-AUX-Mood-Cnd DOC%` + literature_corpus$`upos-VERB-Mood-Cnd DOC%`
literature_corpus$`upos-VERB-Tense-Pres DOC%` <-
  literature_corpus$`upos-AUX-Tense-Pres DOC%` + literature_corpus$`upos-VERB-Tense-Pres DOC%`
literature_corpus$`upos-VERB-Tense-Imp DOC%` <-
  literature_corpus$`upos-AUX-Tense-Imp DOC%` + literature_corpus$`upos-VERB-Tense-Imp DOC%`
literature_corpus$`upos-VERB-Tense-Past DOC%` <-
  literature_corpus$`upos-AUX-Tense-Past DOC%` + literature_corpus$`upos-VERB-Tense-Past DOC%`
literature_corpus$`upos-VERB-Tense-Fut DOC%` <-
  literature_corpus$`upos-AUX-Tense-Fut DOC%` + literature_corpus$`upos-VERB-Tense-Fut DOC%`

# STEP 3. Drop auxiliary verb feature columns
literature_corpus <- dplyr::select(literature_corpus, -c(21:27))

# =============================== REFERENCE CORPUS DATA SET ===============================

# STEP 1. Select relevant columns
reference_corpus <- reference_corpus %>% dplyr::select(all_of(selected_columns))

# STEP 2.1 Add up A1 and A2 vocabulary
reference_corpus <- reference_corpus %>%
  mutate(`Freq A1-A2` = `Freq A1` + `Freq A2`) %>%
  select(`Freq A1-A2`,                         # New column
         (match("Freq A2", names(.)) + 1):ncol(.))  # Columns after "Freq A2"

reference_corpus <- reference_corpus %>%
  mutate(`TFIDF A1-A2` = `TFIDF A1` + `TFIDF A2`) %>%
  select(1:(match("TFIDF A1", names(.)) - 1),
         `TFIDF A1-A2`,                         # New column
         (match("TFIDF A2", names(.)) + 1):ncol(.))  # Columns after "Freq A2"

# STEP 2.2 Add up auxiliary and regular verb features in a single column
reference_corpus$`upos-VERB-Mood-Ind DOC%` <-
  reference_corpus$`upos-AUX-Mood-Ind DOC%` + reference_corpus$`upos-VERB-Mood-Ind DOC%`
reference_corpus$`upos-VERB-Mood-Sub DOC%` <-
  reference_corpus$`upos-AUX-Mood-Sub DOC%` + reference_corpus$`upos-VERB-Mood-Sub DOC%`
reference_corpus$`upos-VERB-Mood-Cnd DOC%` <-
  reference_corpus$`upos-AUX-Mood-Cnd DOC%` + reference_corpus$`upos-VERB-Mood-Cnd DOC%`
reference_corpus$`upos-VERB-Tense-Pres DOC%` <-
  reference_corpus$`upos-AUX-Tense-Pres DOC%` + reference_corpus$`upos-VERB-Tense-Pres DOC%`
reference_corpus$`upos-VERB-Tense-Imp DOC%` <-
  reference_corpus$`upos-AUX-Tense-Imp DOC%` + reference_corpus$`upos-VERB-Tense-Imp DOC%`
reference_corpus$`upos-VERB-Tense-Past DOC%` <-
  reference_corpus$`upos-AUX-Tense-Past DOC%` + reference_corpus$`upos-VERB-Tense-Past DOC%`
reference_corpus$`upos-VERB-Tense-Fut DOC%` <-
  reference_corpus$`upos-AUX-Tense-Fut DOC%` + reference_corpus$`upos-VERB-Tense-Fut DOC%`

# STEP 3. Drop auxiliary verb feature columns
reference_corpus <- dplyr::select(reference_corpus, -c(19:25))

That’s starting to look a lot better now! However, since in the original file all parse tree properties are in a tuple in the same column, we still need to tweak the data frame columns a little bit more:

# STEP 1. REMOVE TUPLE PARENTHESES
# --------------------------------

# Function to remove parentheses
strip_parens <- function(x) gsub('[()]', '', x)

# Remove ( and ) from the beginning and the end of each tuple.
# Apply the function per row (2 = column, 1 = row).

# === Graded readers ===
readers_corpus[, 'Tree'] <- data.frame(apply(readers_corpus[, 'Tree'], 2, strip_parens))
readers_corpus[, 'Context tree'] <- data.frame(apply(readers_corpus[, 'Context tree'], 2, strip_parens))

# === Literary works ===
literature_corpus[, 'Tree'] <- data.frame(apply(literature_corpus[, 'Tree'], 2, strip_parens))
literature_corpus[, 'Context tree'] <- data.frame(apply(literature_corpus[, 'Context tree'], 2, strip_parens))

# === Reference corpus ===
reference_corpus[, 'Tree'] <- data.frame(apply(reference_corpus[, 'Tree'], 2, strip_parens))
reference_corpus[, 'Context tree'] <- data.frame(apply(reference_corpus[, 'Context tree'], 2, strip_parens))

# STEP 2. CREATE NEW COLUMNS AND DROP OLD ONES
# --------------------------------------------

new_tree_columns <- c('Min_Width', 'Max_Width', 'Avg_Width',
                      'Min_Depth', 'Max_Depth', 'Avg_Depth')
new_context_tree_columns <- c('Ctxt_Min_Width', 'Ctxt_Max_Width', 'Ctxt_Avg_Width',
                              'Ctxt_Min_Depth', 'Ctxt_Max_Depth', 'Ctxt_Avg_Depth')

# === Graded readers ===
readers_corpus[new_tree_columns] <- str_split_fixed(readers_corpus$Tree, ', ', 6)
readers_corpus[new_context_tree_columns] <- str_split_fixed(readers_corpus$`Context tree`, ', ', 6)
readers_corpus <- dplyr::select(readers_corpus, -c('Tree', 'Context tree'))

# === Literary works ===
literature_corpus[new_tree_columns] <- str_split_fixed(literature_corpus$Tree, ', ', 6)
literature_corpus[new_context_tree_columns] <- str_split_fixed(literature_corpus$`Context tree`, ', ', 6)
literature_corpus <- dplyr::select(literature_corpus, -c('Tree', 'Context tree'))

# === Reference corpus ===
reference_corpus[new_tree_columns] <- str_split_fixed(reference_corpus$Tree, ', ', 6)
reference_corpus[new_context_tree_columns] <- str_split_fixed(reference_corpus$`Context tree`, ', ', 6)
reference_corpus <- dplyr::select(reference_corpus, -c('Tree', 'Context tree'))

# STEP 3. CONVERT NEW COLUMNS TO NUMERIC TYPE VECTORS
# ---------------------------------------------------

# === Graded readers ===
readers_corpus[, new_tree_columns] <- apply(readers_corpus[, new_tree_columns], 2, function(x) as.numeric(x))
readers_corpus[, new_context_tree_columns] <- apply(readers_corpus[, new_context_tree_columns], 2,
                                                    function(x) as.numeric(x))

# === Literary works ===
literature_corpus[, new_tree_columns] <- apply(literature_corpus[, new_tree_columns], 2,
                                               function(x) as.numeric(x))
literature_corpus[, new_context_tree_columns] <- apply(literature_corpus[, new_context_tree_columns], 2,
                                                       function(x) as.numeric(x))

# === Reference corpus ===
reference_corpus[, new_tree_columns] <- apply(reference_corpus[, new_tree_columns], 2,
                                              function(x) as.numeric(x))
reference_corpus[, new_context_tree_columns] <- apply(reference_corpus[, new_context_tree_columns], 2,
                                                      function(x) as.numeric(x))

Last but not least, we are going to rename some columns so that they are easier to operate with.

# === Graded readers ===
readers_corpus <- readers_corpus %>%
  rename(`Freq_A1A2` = `Freq A1-A2`,
         `Freq_B1` = `Freq B1`,
         `Freq_B2` = `Freq B2`,
         `TFIDF_A1A2` = `TFIDF A1-A2`,
         `TFIDF_B1` = `TFIDF B1`,
         `TFIDF_B2` = `TFIDF B2`,
         `Ctxt_Freq` = `Context freq`,
         `Ctxt_TFIDF` = `Context TFIDF`,
         `Freq_Markers` = `deprel-mark DOC%`,
         `Freq_AdvCl` = `deprel-advcl DOC%`,
         `Freq_XComp` = `deprel-xcomp DOC%`,
         `Freq_CComp` = `deprel-ccomp DOC%`,
         `Freq_ACl` = `deprel-acl DOC%`,
         `Freq_NMod` = `deprel-nmod DOC%`,
         `Freq_Conj` = `deprel-conj DOC%`,
         `Freq_Appos` = `deprel-appos DOC%`,
         `Freq_Past` = `upos-VERB-Tense-Past DOC%`,
         `Freq_Pres` = `upos-VERB-Tense-Pres DOC%`,
         `Freq_Fut` = `upos-VERB-Tense-Fut DOC%`,
         `Freq_Imp` = `upos-VERB-Tense-Imp DOC%`,
         `Freq_Ind` = `upos-VERB-Mood-Ind DOC%`,
         `Freq_Sub` = `upos-VERB-Mood-Sub DOC%`,
         `Freq_Cnd` = `upos-VERB-Mood-Cnd DOC%`,
         `Density` = `Lexical density`)

# === Literary works ===
literature_corpus <- literature_corpus %>%
  rename(`Freq_A1A2` = `Freq A1-A2`,
         `Freq_B1` = `Freq B1`,
         `Freq_B2` = `Freq B2`,
         `TFIDF_A1A2` = `TFIDF A1-A2`,
         `TFIDF_B1` = `TFIDF B1`,
         `TFIDF_B2` = `TFIDF B2`,
         `Ctxt_Freq` = `Context freq`,
         `Ctxt_TFIDF` = `Context TFIDF`,
         `Freq_Markers` = `deprel-mark DOC%`,
         `Freq_AdvCl` = `deprel-advcl DOC%`,
         `Freq_XComp` = `deprel-xcomp DOC%`,
         `Freq_CComp` = `deprel-ccomp DOC%`,
         `Freq_ACl` = `deprel-acl DOC%`,
         `Freq_NMod` = `deprel-nmod DOC%`,
         `Freq_Conj` = `deprel-conj DOC%`,
         `Freq_Appos` = `deprel-appos DOC%`,
         `Freq_Past` = `upos-VERB-Tense-Past DOC%`,
         `Freq_Pres` = `upos-VERB-Tense-Pres DOC%`,
         `Freq_Fut` = `upos-VERB-Tense-Fut DOC%`,
         `Freq_Imp` = `upos-VERB-Tense-Imp DOC%`,
         `Freq_Ind` = `upos-VERB-Mood-Ind DOC%`,
         `Freq_Sub` = `upos-VERB-Mood-Sub DOC%`,
         `Freq_Cnd` = `upos-VERB-Mood-Cnd DOC%`,
         `Density` = `Lexical density`)

# === Reference corpus ===
reference_corpus <- reference_corpus %>%
  rename(`Freq_A1A2` = `Freq A1-A2`,
         `Freq_B1` = `Freq B1`,
         `Freq_B2` = `Freq B2`,
         `TFIDF_A1A2` = `TFIDF A1-A2`,
         `TFIDF_B1` = `TFIDF B1`,
         `TFIDF_B2` = `TFIDF B2`,
         `Ctxt_Freq` = `Context freq`,
         `Ctxt_TFIDF` = `Context TFIDF`,
         `Freq_Markers` = `deprel-mark DOC%`,
         `Freq_AdvCl` = `deprel-advcl DOC%`,
         `Freq_XComp` = `deprel-xcomp DOC%`,
         `Freq_CComp` = `deprel-ccomp DOC%`,
         `Freq_ACl` = `deprel-acl DOC%`,
         `Freq_NMod` = `deprel-nmod DOC%`,
         `Freq_Conj` = `deprel-conj DOC%`,
         `Freq_Appos` = `deprel-appos DOC%`,
         `Freq_Past` = `upos-VERB-Tense-Past DOC%`,
         `Freq_Pres` = `upos-VERB-Tense-Pres DOC%`,
         `Freq_Fut` = `upos-VERB-Tense-Fut DOC%`,
         `Freq_Imp` = `upos-VERB-Tense-Imp DOC%`,
         `Freq_Ind` = `upos-VERB-Mood-Ind DOC%`,
         `Freq_Sub` = `upos-VERB-Mood-Sub DOC%`,
         `Freq_Cnd` = `upos-VERB-Mood-Cnd DOC%`,
         `Density` = `Lexical density`)

COLUMN EXPLANATION

Let’s take a look at the data sets now!

summary(readers_corpus)
##     Title              Level             Freq_A1A2         Freq_B1       
##  Length:50          Length:50          Min.   :0.1138   Min.   :0.04235  
##  Class :character   Class :character   1st Qu.:0.1906   1st Qu.:0.05429  
##  Mode  :character   Mode  :character   Median :0.2106   Median :0.05909  
##                                        Mean   :0.2084   Mean   :0.06103  
##                                        3rd Qu.:0.2309   3rd Qu.:0.06636  
##                                        Max.   :0.2763   Max.   :0.09866  
##     Freq_B2          TFIDF_A1A2         TFIDF_B1          TFIDF_B2       
##  Min.   :0.01248   Min.   :0.04578   Min.   :0.01906   Min.   :0.007626  
##  1st Qu.:0.02144   1st Qu.:0.08192   1st Qu.:0.03690   1st Qu.:0.019693  
##  Median :0.02546   Median :0.09286   Median :0.04217   Median :0.024310  
##  Mean   :0.02848   Mean   :0.09454   Mean   :0.04410   Mean   :0.028034  
##  3rd Qu.:0.03223   3rd Qu.:0.10722   3rd Qu.:0.05141   3rd Qu.:0.029478  
##  Max.   :0.07377   Max.   :0.14312   Max.   :0.07531   Max.   :0.089192  
##    Ctxt_Freq           Ctxt_TFIDF         Freq_Markers       Freq_AdvCl     
##  Min.   :0.0003962   Min.   :0.0003924   Min.   :0.00898   Min.   :0.00651  
##  1st Qu.:0.0007826   1st Qu.:0.0005900   1st Qu.:0.02218   1st Qu.:0.01850  
##  Median :0.0013098   Median :0.0008089   Median :0.02825   Median :0.02314  
##  Mean   :0.0014201   Mean   :0.0009030   Mean   :0.03311   Mean   :0.02477  
##  3rd Qu.:0.0018667   3rd Qu.:0.0009965   3rd Qu.:0.04008   3rd Qu.:0.02956  
##  Max.   :0.0033756   Max.   :0.0024383   Max.   :0.10017   Max.   :0.04884  
##    Freq_XComp         Freq_CComp           Freq_ACl          Freq_NMod      
##  Min.   :0.002604   Min.   :0.0008453   Min.   :0.001559   Min.   :0.00815  
##  1st Qu.:0.010994   1st Qu.:0.0078364   1st Qu.:0.007381   1st Qu.:0.02439  
##  Median :0.013102   Median :0.0124764   Median :0.009757   Median :0.02905  
##  Mean   :0.013646   Mean   :0.0120540   Mean   :0.010855   Mean   :0.03131  
##  3rd Qu.:0.016296   3rd Qu.:0.0153399   3rd Qu.:0.014058   3rd Qu.:0.03624  
##  Max.   :0.027732   Max.   :0.0321189   Max.   :0.023438   Max.   :0.07552  
##    Freq_Conj         Freq_Appos         Freq_Pres          Freq_Past        
##  Min.   :0.02023   Min.   :0.000000   Min.   :0.002604   Min.   :0.0002351  
##  1st Qu.:0.02718   1st Qu.:0.006202   1st Qu.:0.041965   1st Qu.:0.0094868  
##  Median :0.03126   Median :0.008568   Median :0.082327   Median :0.0212278  
##  Mean   :0.03296   Mean   :0.010089   Mean   :0.074371   Mean   :0.0261291  
##  3rd Qu.:0.03806   3rd Qu.:0.013424   3rd Qu.:0.105707   3rd Qu.:0.0422629  
##  Max.   :0.06891   Max.   :0.034286   Max.   :0.131641   Max.   :0.0757431  
##     Freq_Fut            Freq_Imp           Freq_Ind         Freq_Sub        
##  Min.   :0.0000000   Min.   :0.000000   Min.   :0.0599   Min.   :0.0000000  
##  1st Qu.:0.0001297   1st Qu.:0.001783   1st Qu.:0.1035   1st Qu.:0.0006185  
##  Median :0.0014752   Median :0.011088   Median :0.1128   Median :0.0016107  
##  Mean   :0.0020725   Mean   :0.018601   Mean   :0.1104   Mean   :0.0034776  
##  3rd Qu.:0.0031379   3rd Qu.:0.031652   3rd Qu.:0.1210   3rd Qu.:0.0050158  
##  Max.   :0.0081425   Max.   :0.064645   Max.   :0.1481   Max.   :0.0219371  
##     Freq_Cnd            Density           msttr             mtld       
##  Min.   :0.0000000   Min.   :0.3874   Min.   :0.6550   Min.   : 53.36  
##  1st Qu.:0.0000000   1st Qu.:0.4077   1st Qu.:0.7072   1st Qu.: 83.61  
##  Median :0.0004377   Median :0.4182   Median :0.7261   Median : 95.08  
##  Mean   :0.0010487   Mean   :0.4199   Mean   :0.7227   Mean   : 95.09  
##  3rd Qu.:0.0012980   3rd Qu.:0.4269   3rd Qu.:0.7385   3rd Qu.:107.60  
##  Max.   :0.0092402   Max.   :0.4956   Max.   :0.7658   Max.   :127.83  
##       hdd             Herdan         Min_Width      Max_Width    
##  Min.   :0.8258   Min.   :0.8082   Min.   :1.00   Min.   :11.00  
##  1st Qu.:0.8481   1st Qu.:0.8417   1st Qu.:1.00   1st Qu.:15.00  
##  Median :0.8634   Median :0.8640   Median :2.00   Median :17.00  
##  Mean   :0.8592   Mean   :0.8606   Mean   :1.72   Mean   :18.32  
##  3rd Qu.:0.8687   3rd Qu.:0.8766   3rd Qu.:2.00   3rd Qu.:21.50  
##  Max.   :0.8845   Max.   :0.9086   Max.   :4.00   Max.   :32.00  
##    Avg_Width        Min_Depth     Max_Depth       Avg_Depth     Ctxt_Min_Width
##  Min.   : 5.219   Min.   :1.0   Min.   : 5.00   Min.   :3.106   Min.   :1.0   
##  1st Qu.: 6.021   1st Qu.:1.0   1st Qu.: 7.00   1st Qu.:3.436   1st Qu.:1.0   
##  Median : 6.967   Median :1.0   Median : 8.00   Median :4.058   Median :1.0   
##  Mean   : 7.362   Mean   :1.1   Mean   : 9.66   Mean   :4.277   Mean   :1.5   
##  3rd Qu.: 8.257   3rd Qu.:1.0   3rd Qu.:11.00   3rd Qu.:4.836   3rd Qu.:2.0   
##  Max.   :11.956   Max.   :2.0   Max.   :20.00   Max.   :7.005   Max.   :4.0   
##  Ctxt_Max_Width  Ctxt_Avg_Width  Ctxt_Min_Depth Ctxt_Max_Depth  Ctxt_Avg_Depth 
##  Min.   :11.00   Min.   :4.406   Min.   :1.00   Min.   : 5.00   Min.   :2.476  
##  1st Qu.:15.00   1st Qu.:5.047   1st Qu.:1.00   1st Qu.: 7.00   1st Qu.:2.725  
##  Median :17.00   Median :5.598   Median :1.00   Median : 8.00   Median :3.055  
##  Mean   :18.32   Mean   :6.031   Mean   :1.08   Mean   : 9.66   Mean   :3.406  
##  3rd Qu.:21.50   3rd Qu.:6.616   3rd Qu.:1.00   3rd Qu.:11.00   3rd Qu.:3.705  
##  Max.   :32.00   Max.   :9.845   Max.   :2.00   Max.   :20.00   Max.   :5.501
summary(literature_corpus)
##     Title              Level             Freq_A1A2         Freq_B1       
##  Length:50          Length:50          Min.   :0.1136   Min.   :0.04577  
##  Class :character   Class :character   1st Qu.:0.1794   1st Qu.:0.05556  
##  Mode  :character   Mode  :character   Median :0.1904   Median :0.05977  
##                                        Mean   :0.1920   Mean   :0.06323  
##                                        3rd Qu.:0.2047   3rd Qu.:0.06565  
##                                        Max.   :0.2898   Max.   :0.13870  
##     Freq_B2          TFIDF_A1A2         TFIDF_B1          TFIDF_B2      
##  Min.   :0.01890   Min.   :0.04379   Min.   :0.02675   Min.   :0.01218  
##  1st Qu.:0.02767   1st Qu.:0.06263   1st Qu.:0.03314   1st Qu.:0.02141  
##  Median :0.03167   Median :0.07404   Median :0.03957   Median :0.02750  
##  Mean   :0.03274   Mean   :0.08005   Mean   :0.04084   Mean   :0.02817  
##  3rd Qu.:0.03599   3rd Qu.:0.09266   3rd Qu.:0.04430   3rd Qu.:0.03447  
##  Max.   :0.05742   Max.   :0.13644   Max.   :0.10853   Max.   :0.05326  
##    Ctxt_Freq           Ctxt_TFIDF         Freq_Markers       Freq_AdvCl     
##  Min.   :0.0004093   Min.   :0.0003712   Min.   :0.02626   Min.   :0.01377  
##  1st Qu.:0.0005888   1st Qu.:0.0004827   1st Qu.:0.03710   1st Qu.:0.02558  
##  Median :0.0011018   Median :0.0007973   Median :0.04287   Median :0.02995  
##  Mean   :0.0015707   Mean   :0.0009023   Mean   :0.04336   Mean   :0.03137  
##  3rd Qu.:0.0021075   3rd Qu.:0.0012378   3rd Qu.:0.04855   3rd Qu.:0.03394  
##  Max.   :0.0047613   Max.   :0.0024171   Max.   :0.07274   Max.   :0.05173  
##    Freq_XComp         Freq_CComp          Freq_ACl          Freq_NMod       
##  Min.   :0.005269   Min.   :0.004918   Min.   :0.003701   Min.   :0.008239  
##  1st Qu.:0.012109   1st Qu.:0.011840   1st Qu.:0.010534   1st Qu.:0.022233  
##  Median :0.015944   Median :0.014177   Median :0.013848   Median :0.025579  
##  Mean   :0.015815   Mean   :0.014730   Mean   :0.013829   Mean   :0.027059  
##  3rd Qu.:0.018179   3rd Qu.:0.017407   3rd Qu.:0.017341   3rd Qu.:0.030829  
##  Max.   :0.032590   Max.   :0.027444   Max.   :0.021779   Max.   :0.053589  
##    Freq_Conj         Freq_Appos         Freq_Pres          Freq_Past       
##  Min.   :0.01119   Min.   :0.001083   Min.   :0.003279   Min.   :0.009864  
##  1st Qu.:0.02910   1st Qu.:0.002440   1st Qu.:0.021214   1st Qu.:0.042088  
##  Median :0.03228   Median :0.004649   Median :0.032619   Median :0.048117  
##  Mean   :0.03279   Mean   :0.004911   Mean   :0.038143   Mean   :0.047313  
##  3rd Qu.:0.03687   3rd Qu.:0.005939   3rd Qu.:0.048881   3rd Qu.:0.059149  
##  Max.   :0.06856   Max.   :0.013388   Max.   :0.138101   Max.   :0.078007  
##     Freq_Fut           Freq_Imp          Freq_Ind          Freq_Sub        
##  Min.   :0.000000   Min.   :0.00000   Min.   :0.07315   Min.   :0.0009794  
##  1st Qu.:0.001155   1st Qu.:0.02588   1st Qu.:0.10459   1st Qu.:0.0036746  
##  Median :0.003112   Median :0.03885   Median :0.11307   Median :0.0051907  
##  Mean   :0.003355   Mean   :0.03794   Mean   :0.11277   Mean   :0.0055350  
##  3rd Qu.:0.004448   3rd Qu.:0.04964   3rd Qu.:0.11988   3rd Qu.:0.0069880  
##  Max.   :0.012330   Max.   :0.06690   Max.   :0.14673   Max.   :0.0117270  
##     Freq_Cnd            Density           msttr             mtld       
##  Min.   :0.0000000   Min.   :0.3925   Min.   :0.6805   Min.   : 73.51  
##  1st Qu.:0.0007332   1st Qu.:0.4118   1st Qu.:0.7202   1st Qu.: 93.83  
##  Median :0.0014536   Median :0.4209   Median :0.7309   Median :101.42  
##  Mean   :0.0017794   Mean   :0.4240   Mean   :0.7312   Mean   :103.76  
##  3rd Qu.:0.0026394   3rd Qu.:0.4351   3rd Qu.:0.7433   3rd Qu.:110.78  
##  Max.   :0.0057773   Max.   :0.4871   Max.   :0.7967   Max.   :154.88  
##       hdd             Herdan         Min_Width      Max_Width    
##  Min.   :0.8243   Min.   :0.8337   Min.   :1.00   Min.   : 7.00  
##  1st Qu.:0.8437   1st Qu.:0.8562   1st Qu.:2.00   1st Qu.:14.00  
##  Median :0.8537   Median :0.8678   Median :2.00   Median :18.00  
##  Mean   :0.8550   Mean   :0.8691   Mean   :2.14   Mean   :18.26  
##  3rd Qu.:0.8659   3rd Qu.:0.8848   3rd Qu.:2.75   3rd Qu.:21.00  
##  Max.   :0.8899   Max.   :0.9171   Max.   :5.00   Max.   :33.00  
##    Avg_Width        Min_Depth      Max_Depth       Avg_Depth     Ctxt_Min_Width
##  Min.   : 4.172   Min.   :1.00   Min.   : 5.00   Min.   :2.956   Min.   :1.00  
##  1st Qu.: 7.011   1st Qu.:1.00   1st Qu.: 8.25   1st Qu.:4.149   1st Qu.:1.00  
##  Median : 7.854   Median :1.00   Median :10.00   Median :4.733   Median :2.00  
##  Mean   : 8.149   Mean   :1.08   Mean   :10.50   Mean   :4.658   Mean   :1.82  
##  3rd Qu.: 9.255   3rd Qu.:1.00   3rd Qu.:11.00   3rd Qu.:5.129   3rd Qu.:2.00  
##  Max.   :14.357   Max.   :2.00   Max.   :45.00   Max.   :7.750   Max.   :3.00  
##  Ctxt_Max_Width  Ctxt_Avg_Width   Ctxt_Min_Depth Ctxt_Max_Depth 
##  Min.   : 7.00   Min.   : 3.382   Min.   :1.00   Min.   : 5.00  
##  1st Qu.:14.00   1st Qu.: 6.271   1st Qu.:1.00   1st Qu.: 8.25  
##  Median :18.00   Median : 6.823   Median :1.00   Median :10.00  
##  Mean   :18.26   Mean   : 6.834   Mean   :1.02   Mean   :10.50  
##  3rd Qu.:21.00   3rd Qu.: 7.666   3rd Qu.:1.00   3rd Qu.:11.00  
##  Max.   :33.00   Max.   :11.036   Max.   :2.00   Max.   :45.00  
##  Ctxt_Avg_Depth 
##  Min.   :2.418  
##  1st Qu.:3.417  
##  Median :3.849  
##  Mean   :3.841  
##  3rd Qu.:4.204  
##  Max.   :5.795
summary(reference_corpus)
##    Freq_A1A2         Freq_B1           Freq_B2          TFIDF_A1A2      
##  Min.   :0.0163   Min.   :0.00000   Min.   :0.00000   Min.   :0.007464  
##  1st Qu.:0.1102   1st Qu.:0.05202   1st Qu.:0.04096   1st Qu.:0.101886  
##  Median :0.1282   Median :0.06459   Median :0.05285   Median :0.127978  
##  Mean   :0.1298   Mean   :0.06546   Mean   :0.05468   Mean   :0.131676  
##  3rd Qu.:0.1483   3rd Qu.:0.07826   3rd Qu.:0.06606   3rd Qu.:0.155976  
##  Max.   :0.2590   Max.   :0.20930   Max.   :0.18750   Max.   :0.303396  
##     TFIDF_B1          TFIDF_B2         Ctxt_Freq          Ctxt_TFIDF      
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.001719   Min.   :0.002539  
##  1st Qu.:0.07567   1st Qu.:0.06533   1st Qu.:0.003947   1st Qu.:0.005356  
##  Median :0.09512   Median :0.08563   Median :0.004920   Median :0.006680  
##  Mean   :0.09678   Mean   :0.08808   Mean   :0.006375   Mean   :0.008078  
##  3rd Qu.:0.11637   3rd Qu.:0.10737   3rd Qu.:0.006551   3rd Qu.:0.008613  
##  Max.   :0.26066   Max.   :0.24721   Max.   :0.042411   Max.   :0.048931  
##   Freq_Markers       Freq_AdvCl         Freq_XComp         Freq_CComp      
##  Min.   :0.00000   Min.   :0.000000   Min.   :0.000000   Min.   :0.000000  
##  1st Qu.:0.02222   1st Qu.:0.009662   1st Qu.:0.003205   1st Qu.:0.002924  
##  Median :0.03315   Median :0.015773   Median :0.007335   Median :0.008287  
##  Mean   :0.03442   Mean   :0.016769   Mean   :0.008506   Mean   :0.010376  
##  3rd Qu.:0.04589   3rd Qu.:0.022814   3rd Qu.:0.012526   3rd Qu.:0.015789  
##  Max.   :0.17241   Max.   :0.068182   Max.   :0.047619   Max.   :0.064516  
##     Freq_ACl         Freq_NMod         Freq_Conj         Freq_Appos     
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:0.01208   1st Qu.:0.04792   1st Qu.:0.02096   1st Qu.:0.00565  
##  Median :0.01724   Median :0.06468   Median :0.03030   Median :0.01149  
##  Mean   :0.01802   Mean   :0.06598   Mean   :0.03246   Mean   :0.01395  
##  3rd Qu.:0.02317   3rd Qu.:0.08156   3rd Qu.:0.04073   3rd Qu.:0.01833  
##  Max.   :0.06897   Max.   :0.20000   Max.   :0.11957   Max.   :0.14674  
##    Freq_Pres         Freq_Past          Freq_Fut           Freq_Imp       
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.000000   Min.   :0.000000  
##  1st Qu.:0.02148   1st Qu.:0.01544   1st Qu.:0.000000   1st Qu.:0.000000  
##  Median :0.03883   Median :0.02689   Median :0.000000   Median :0.003650  
##  Mean   :0.04205   Mean   :0.02876   Mean   :0.004208   Mean   :0.009057  
##  3rd Qu.:0.05902   3rd Qu.:0.04075   3rd Qu.:0.005736   3rd Qu.:0.011682  
##  Max.   :0.13548   Max.   :0.07812   Max.   :0.048276   Max.   :0.088328  
##     Freq_Ind          Freq_Sub           Freq_Cnd           Density      
##  Min.   :0.00000   Min.   :0.000000   Min.   :0.000000   Min.   :0.3112  
##  1st Qu.:0.05581   1st Qu.:0.000000   1st Qu.:0.000000   1st Qu.:0.4228  
##  Median :0.06838   Median :0.003731   Median :0.000000   Median :0.4396  
##  Mean   :0.07018   Mean   :0.005091   Mean   :0.001587   Mean   :0.4402  
##  3rd Qu.:0.08203   3rd Qu.:0.007595   3rd Qu.:0.002336   3rd Qu.:0.4572  
##  Max.   :0.14815   Max.   :0.047619   Max.   :0.066667   Max.   :0.5667  
##      msttr             mtld             hdd             Herdan      
##  Min.   :0.4900   Min.   : 21.00   Min.   :0.6518   Min.   :0.8355  
##  1st Qu.:0.6783   1st Qu.: 67.99   1st Qu.:0.7979   1st Qu.:0.8890  
##  Median :0.7050   Median : 81.42   Median :0.8183   Median :0.9017  
##  Mean   :0.7095   Mean   : 85.19   Mean   :0.8189   Mean   :0.9031  
##  3rd Qu.:0.7350   3rd Qu.: 97.89   3rd Qu.:0.8425   3rd Qu.:0.9153  
##  Max.   :1.0000   Max.   :202.39   Max.   :1.0000   Max.   :1.0000  
##    Min_Width        Max_Width       Avg_Width        Min_Depth     
##  Min.   : 1.000   Min.   : 4.00   Min.   : 3.800   Min.   : 1.000  
##  1st Qu.: 4.000   1st Qu.:13.00   1st Qu.: 9.198   1st Qu.: 2.000  
##  Median : 5.000   Median :16.00   Median :10.881   Median : 3.000  
##  Mean   : 5.356   Mean   :15.89   Mean   :10.756   Mean   : 3.328  
##  3rd Qu.: 7.000   3rd Qu.:18.00   3rd Qu.:12.204   3rd Qu.: 4.000  
##  Max.   :22.000   Max.   :56.00   Max.   :22.000   Max.   :10.000  
##    Max_Depth        Avg_Depth      Ctxt_Min_Width   Ctxt_Max_Width
##  Min.   : 2.000   Min.   : 1.800   Min.   : 1.000   Min.   : 4.0  
##  1st Qu.: 7.000   1st Qu.: 5.423   1st Qu.: 3.000   1st Qu.:13.0  
##  Median : 9.000   Median : 6.260   Median : 5.000   Median :16.0  
##  Mean   : 8.834   Mean   : 6.246   Mean   : 5.164   Mean   :15.9  
##  3rd Qu.:10.000   3rd Qu.: 7.042   3rd Qu.: 6.000   3rd Qu.:18.0  
##  Max.   :20.000   Max.   :11.886   Max.   :22.000   Max.   :56.0  
##  Ctxt_Avg_Width   Ctxt_Min_Depth   Ctxt_Max_Depth   Ctxt_Avg_Depth  
##  Min.   : 3.667   Min.   : 1.000   Min.   : 2.000   Min.   : 1.667  
##  1st Qu.: 8.200   1st Qu.: 2.000   1st Qu.: 7.000   1st Qu.: 4.800  
##  Median : 9.778   Median : 3.000   Median : 9.000   Median : 5.700  
##  Mean   : 9.752   Mean   : 3.195   Mean   : 8.834   Mean   : 5.682  
##  3rd Qu.:11.222   3rd Qu.: 4.000   3rd Qu.:10.000   3rd Qu.: 6.545  
##  Max.   :22.000   Max.   :10.000   Max.   :20.000   Max.   :10.500

The graded readers and the literary works tibbles have the exact same structure: 42 columns (variables) x 50 rows (works considered). The reference corpus has virtually the same columns, save for the first two (Title and Level), which are not there because we make no distinction among the texts composing the corpus. The number of rows, however, is different, since this corpus is made up of 609 files. Therefore, the resulting tibble has a structure of 40 columns x 609 rows.

Data exploration and visualisation

DESCRIPTIVE STATISTICS

Occurring LI

Sankey charts depicting the relative frequency of graded vocabulary occurrences per corpus and lexical item level.

Parse trees’ properties

Violin plots depicting the properties of the parse trees of the sentences where the graded vocabulary and its context occur.

Relative frequencies of moods

Density ridgeline plots

## Picking joint bandwidth of 0.00694
## Picking joint bandwidth of 0.0014
## Picking joint bandwidth of 0.000354

## Picking joint bandwidth of 0.00443
## Picking joint bandwidth of 0.00121
## Picking joint bandwidth of 0.000596

## Picking joint bandwidth of 0.00488
## Picking joint bandwidth of 0.00141
## Picking joint bandwidth of 0.000435

Random forests and permutation tests

ASSESSING MULTICOLLINEARITY

Pairwise Spearman correlations

par(mfrow=c(2,1))

# === GRADED READERS ===
readers_corr <- dplyr::select(readers_corpus, -c(1, 2))
readers_corr <- cor(readers_corr, method = "spearman")
readers_corr <- corrplot(readers_corr, type = "lower", order = "hclust",
                         title = "Graded readers", mar=c(0,0,1,0), tl.cex = 1,
                         tl.col = "black", tl.srt = 45, cex.main = 1.5)

readers_corr_matrix <- dplyr::select(readers_corpus, -c(1, 2)) %>% cor(method = "spearman")
readers_corr_long <- melt(readers_corr_matrix, varnames = c("Var1", "Var2"), value.name = "Correlation")
readers_corr_long <- readers_corr_long %>% filter(Var1 != Var2)
readers_corr_long <- readers_corr_long %>%
  mutate(Var1_order = pmin(as.character(Var1), as.character(Var2)),
         Var2_order = pmax(as.character(Var1), as.character(Var2))) %>%
  distinct(Var1_order, Var2_order, .keep_all = TRUE)

readers_strongest_positive <- readers_corr_long %>% filter(Correlation > 0) %>%
  arrange(desc(Correlation))

readers_strongest_negative <- readers_corr_long %>% filter(Correlation < 0) %>%
  arrange(Correlation)

# === LITERARY WORKS ===
literature_corr <- dplyr::select(literature_corpus, -c(1, 2))
literature_corr <- cor(literature_corr, method = "spearman")
literature_corr <- corrplot(literature_corr, type = "lower", order = "hclust",
                            title = "Literary works", mar=c(0,0,1,0), tl.cex = 1,
                            tl.col = "black", tl.srt = 45, cex.main = 1.5)

literature_corr_matrix <- dplyr::select(literature_corpus, -c(1, 2)) %>% cor(method = "spearman")
literature_corr_long <- melt(literature_corr_matrix, varnames = c("Var1", "Var2"), value.name = "Correlation")
literature_corr_long <- literature_corr_long %>% filter(Var1 != Var2)
literature_corr_long <- literature_corr_long %>%
  mutate(Var1_order = pmin(as.character(Var1), as.character(Var2)),
         Var2_order = pmax(as.character(Var1), as.character(Var2))) %>%
  distinct(Var1_order, Var2_order, .keep_all = TRUE)

literature_strongest_positive <- literature_corr_long %>% filter(Correlation > 0) %>%
  arrange(desc(Correlation))

literature_strongest_negative <- literature_corr_long %>% filter(Correlation < 0) %>%
  arrange(Correlation)

RANDOM FORESTS

# === GRADED READERS ===

# --- Step 1. Split the data into a training (70%) and testing (30%) sets using a stratified random
# sampling approach ---
set.seed(93)

readers_training <- readers_corpus %>%
  group_by(Level) %>%
  slice_sample(prop=0.7)

readers_testing <- anti_join(readers_corpus, readers_training, by = "Title")

# --- Step 2. Fit the random forest model ---
set.seed(1)

readers_model <- randomForest(Level ~ . - Title, data = readers_training, mtry=6, ntree=183,
                              importance=TRUE, proximity=TRUE)
readers_model
## 
## Call:
##  randomForest(formula = Level ~ . - Title, data = readers_training,      mtry = 6, ntree = 183, importance = TRUE, proximity = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 183
## No. of variables tried at each split: 6
## 
##         OOB estimate of  error rate: 18.18%
## Confusion matrix:
##              Beginner Intermediate Advanced class.error
## Beginner           10            1        0  0.09090909
## Intermediate        2           10        1  0.23076923
## Advanced            0            2        7  0.22222222
# --- Step 3. Plot the out-of-bag error rate information ---
readers_obb_error_data <- data.frame(
  Trees=rep(1:nrow(readers_model$err.rate)),
  Type=rep(c("OOB", "Beginner", "Intermediate", "Advanced"), each=nrow(readers_model$err.rate)),
  Error=c(readers_model$err.rate[, "OOB"],
          readers_model$err.rate[, "Beginner"],
          readers_model$err.rate[, "Intermediate"],
          readers_model$err.rate[, "Advanced"])
)

readers_obb_error_data$Type <- factor(readers_obb_error_data$Type, levels = c("Beginner", "Intermediate", "Advanced", "OOB"))

ggplot(data=readers_obb_error_data, aes(x=Trees, y=Error)) +
  geom_line(aes(color=Type)) +
  ggtitle("Graded readers' out-of-bag (OOB) error rate information") +
  theme(legend.key.size = unit(1, 'cm'),
        legend.title = element_blank(),
        legend.text = element_text(size=20)) +
  theme(axis.text.x=element_text(size=15),
        axis.title.x=element_text(size=20),
        axis.text.y=element_text(size=18),
        axis.title.y=element_text(size=20)) +
  ylim(0, 0.9)

# --- Step 4. Predict classes for the test set ---
readers_predict <- predict(readers_model, readers_testing)
confusionMatrix(readers_predict, readers_testing$Level)
## Confusion Matrix and Statistics
## 
##               Reference
## Prediction     Beginner Intermediate Advanced
##   Beginner            6            0        2
##   Intermediate        0            4        1
##   Advanced            0            2        2
## 
## Overall Statistics
##                                           
##                Accuracy : 0.7059          
##                  95% CI : (0.4404, 0.8969)
##     No Information Rate : 0.3529          
##     P-Value [Acc > NIR] : 0.003268        
##                                           
##                   Kappa : 0.555           
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Beginner Class: Intermediate Class: Advanced
## Sensitivity                   1.0000              0.6667          0.4000
## Specificity                   0.8182              0.9091          0.8333
## Pos Pred Value                0.7500              0.8000          0.5000
## Neg Pred Value                1.0000              0.8333          0.7692
## Prevalence                    0.3529              0.3529          0.2941
## Detection Rate                0.3529              0.2353          0.1176
## Detection Prevalence          0.4706              0.2941          0.2353
## Balanced Accuracy             0.9091              0.7879          0.6167
# --- Step 5. Let's plot the dot charts of variable importance ---
varImpPlot(readers_model,
           sort = T,
           n.var = 40,
           main = "Graded readers' random forest variable importance")

# --- Step 6. Multi-dimensional Scaling Plot of Proximity Matrix ---
readers_distance.matrix <- dist(1-readers_model$proximity)
readers_mds.stuff <- cmdscale(readers_distance.matrix, eig=TRUE, x.ret=TRUE)
readers_mds.var.per <- round(readers_mds.stuff$eig/sum(readers_mds.stuff$eig)*100, 1)
readers_mds.values <- readers_mds.stuff$points
readers_mds.data <- data.frame(Sample=rownames(readers_mds.values),
                               X=readers_mds.values[,1],
                               Y=readers_mds.values[,2],
                               Status=readers_training$Level)

ggplot(data=readers_mds.data, aes(x=X, y=Y, label=Sample)) +
  geom_text(size = 10, aes(color=Status)) +
  theme_bw() +
  xlab(paste("MDS1 - ", readers_mds.var.per[1], "%", sep="")) +
  ylab(paste("MDS2 - ", readers_mds.var.per[2], "%", sep="")) +
  ggtitle("GRs MDS plot (1 - RF proximities)") +
  theme(legend.title = element_blank(),
        legend.text = element_text(size=20)) +
  theme(axis.text.x=element_text(size=15),
        axis.title.x=element_text(size=20),
        axis.text.y=element_text(size=18),
        axis.title.y=element_text(size=20)) +
  theme(plot.title = element_text(hjust = 0.5, size=30)) 

# === LITERARY WORKS ===

# --- Step 1. Split the data into a training (70%) and testing (30%) sets using a stratified random
# sampling approach ---
set.seed(45)

literature_training <- literature_corpus %>%
  group_by(Level) %>%
  slice_sample(prop=0.7)

literature_testing <- anti_join(literature_corpus, literature_training, by = "Title")

#  --- Step 2. Fit the random forest model ---
set.seed(11)

literature_model <- randomForest(Level ~ . - Title, data = literature_training, mtry=6, ntree=131, importance=TRUE, proximity=TRUE)
literature_model
## 
## Call:
##  randomForest(formula = Level ~ . - Title, data = literature_training,      mtry = 6, ntree = 131, importance = TRUE, proximity = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 131
## No. of variables tried at each split: 6
## 
##         OOB estimate of  error rate: 17.14%
## Confusion matrix:
##                       Children's literature Young adult fiction
## Children's literature                    19                   2
## Young adult fiction                       2                   4
## Adult literature                          0                   1
##                       Adult literature class.error
## Children's literature                0   0.0952381
## Young adult fiction                  1   0.4285714
## Adult literature                     6   0.1428571
#  --- Step 3. Plot the out-of-bag error rate information ---
literature_obb_error_data <- data.frame(
  Trees=rep(1:nrow(literature_model$err.rate)),
  Type=rep(c("OOB", "Children's literature", "Young adult fiction", "Adult literature"), each=nrow(literature_model$err.rate)),
  Error=c(literature_model$err.rate[, "OOB"],
          literature_model$err.rate[, "Children's literature"],
          literature_model$err.rate[, "Young adult fiction"],
          literature_model$err.rate[, "Adult literature"])
)

literature_obb_error_data$Type <- factor(literature_obb_error_data$Type, levels = c("Children's literature", "Young adult fiction", "Adult literature", "OOB"))

ggplot(data=literature_obb_error_data, aes(x=Trees, y=Error)) +
  geom_line(aes(color=Type)) +
  theme(legend.key.size = unit(1, 'cm'),
        legend.title = element_blank(),
        legend.text = element_text(size=20)) +
  theme(axis.text.x=element_text(size=15),
        axis.title.x=element_text(size=20),
        axis.text.y=element_text(size=18),
        axis.title.y=element_text(size=20)) +
  ylim(0, 1)

#  --- Step 4. Predict classes for the test set ---
literature_predict <- predict(literature_model, literature_testing)
confusionMatrix(literature_predict, literature_testing$Level)
## Confusion Matrix and Statistics
## 
##                        Reference
## Prediction              Children's literature Young adult fiction
##   Children's literature                     7                   1
##   Young adult fiction                       1                   2
##   Adult literature                          1                   0
##                        Reference
## Prediction              Adult literature
##   Children's literature                0
##   Young adult fiction                  1
##   Adult literature                     2
## 
## Overall Statistics
##                                          
##                Accuracy : 0.7333         
##                  95% CI : (0.449, 0.9221)
##     No Information Rate : 0.6            
##     P-Value [Acc > NIR] : 0.2173         
##                                          
##                   Kappa : 0.5455         
##                                          
##  Mcnemar's Test P-Value : 0.5724         
## 
## Statistics by Class:
## 
##                      Class: Children's literature Class: Young adult fiction
## Sensitivity                                0.7778                     0.6667
## Specificity                                0.8333                     0.8333
## Pos Pred Value                             0.8750                     0.5000
## Neg Pred Value                             0.7143                     0.9091
## Prevalence                                 0.6000                     0.2000
## Detection Rate                             0.4667                     0.1333
## Detection Prevalence                       0.5333                     0.2667
## Balanced Accuracy                          0.8056                     0.7500
##                      Class: Adult literature
## Sensitivity                           0.6667
## Specificity                           0.9167
## Pos Pred Value                        0.6667
## Neg Pred Value                        0.9167
## Prevalence                            0.2000
## Detection Rate                        0.1333
## Detection Prevalence                  0.2000
## Balanced Accuracy                     0.7917
#  --- Step 5. Let's plot the dot charts of variable importance ---
varImpPlot(literature_model,
           sort = T,
           n.var = 40,
           main = "Literary works' random forest variable importance")

#  --- Step 6. Multi-dimensional Scaling Plot of Proximity Matrix ---
literature_distance.matrix <- dist(1-literature_model$proximity)
literature_mds.stuff <- cmdscale(literature_distance.matrix, eig=TRUE, x.ret=TRUE)
literature_mds.var.per <- round(literature_mds.stuff$eig/sum(literature_mds.stuff$eig)*100, 1)
literature_mds.values <- literature_mds.stuff$points
literature_mds.data <- data.frame(Sample=rownames(literature_mds.values),
                                  X=literature_mds.values[,1],
                                  Y=literature_mds.values[,2],
                                  Status=literature_training$Level)

ggplot(data=literature_mds.data, aes(x=X, y=Y, label=Sample)) +
  geom_text(size = 10, aes(color=Status)) +
  theme_bw() +
  xlab(paste("MDS1 - ", literature_mds.var.per[1], "%", sep="")) +
  ylab(paste("MDS2 - ", literature_mds.var.per[2], "%", sep="")) +
  ggtitle("LWs MDS plot (1 - RF proximities)") +
  theme(legend.title = element_blank(),
        legend.text = element_text(size=20)) +
  theme(axis.text.x=element_text(size=15),
        axis.title.x=element_text(size=20),
        axis.text.y=element_text(size=18),
        axis.title.y=element_text(size=20)) +
  theme(plot.title = element_text(hjust = 0.5, size=30)) 

# === BASELINE CLASSIFIERS ===

#  --- 1. Zero-rate ---

# Find the most frequent class in readers_testing
most_frequent_class_readers <- names(which.max(table(readers_testing$Level)))

# Predict all instances as the most frequent class
readers_zero_rate_pred <- factor(rep(most_frequent_class_readers, nrow(readers_testing)), 
                                 levels = levels(readers_testing$Level))

# Evaluate performance
confusionMatrix(readers_zero_rate_pred, readers_testing$Level)
## Confusion Matrix and Statistics
## 
##               Reference
## Prediction     Beginner Intermediate Advanced
##   Beginner            6            6        5
##   Intermediate        0            0        0
##   Advanced            0            0        0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.3529          
##                  95% CI : (0.1421, 0.6167)
##     No Information Rate : 0.3529          
##     P-Value [Acc > NIR] : 0.5903          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Beginner Class: Intermediate Class: Advanced
## Sensitivity                   1.0000              0.0000          0.0000
## Specificity                   0.0000              1.0000          1.0000
## Pos Pred Value                0.3529                 NaN             NaN
## Neg Pred Value                   NaN              0.6471          0.7059
## Prevalence                    0.3529              0.3529          0.2941
## Detection Rate                0.3529              0.0000          0.0000
## Detection Prevalence          1.0000              0.0000          0.0000
## Balanced Accuracy             0.5000              0.5000          0.5000
# Repeat for literature_testing
most_frequent_class_literature <- names(which.max(table(literature_testing$Level)))

literature_zero_rate_pred <- factor(rep(most_frequent_class_literature, nrow(literature_testing)), 
                                    levels = levels(literature_testing$Level))

confusionMatrix(literature_zero_rate_pred, literature_testing$Level)
## Confusion Matrix and Statistics
## 
##                        Reference
## Prediction              Children's literature Young adult fiction
##   Children's literature                     9                   3
##   Young adult fiction                       0                   0
##   Adult literature                          0                   0
##                        Reference
## Prediction              Adult literature
##   Children's literature                3
##   Young adult fiction                  0
##   Adult literature                     0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.6             
##                  95% CI : (0.3229, 0.8366)
##     No Information Rate : 0.6             
##     P-Value [Acc > NIR] : 0.6098          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Children's literature Class: Young adult fiction
## Sensitivity                                   1.0                        0.0
## Specificity                                   0.0                        1.0
## Pos Pred Value                                0.6                        NaN
## Neg Pred Value                                NaN                        0.8
## Prevalence                                    0.6                        0.2
## Detection Rate                                0.6                        0.0
## Detection Prevalence                          1.0                        0.0
## Balanced Accuracy                             0.5                        0.5
##                      Class: Adult literature
## Sensitivity                              0.0
## Specificity                              1.0
## Pos Pred Value                           NaN
## Neg Pred Value                           0.8
## Prevalence                               0.2
## Detection Rate                           0.0
## Detection Prevalence                     0.0
## Balanced Accuracy                        0.5
#  --- 2. Weighted guessing ---
set.seed(123)

# Compute class probabilities based on readers_training
class_probs_readers <- prop.table(table(readers_training$Level))

# Predict based on these probabilities
readers_weighted_pred <- factor(sample(names(class_probs_readers), 
                                       size = nrow(readers_testing), 
                                       replace = TRUE, 
                                       prob = class_probs_readers),
                                levels = levels(readers_testing$Level))

# Evaluate performance
confusionMatrix(readers_weighted_pred, readers_testing$Level)
## Confusion Matrix and Statistics
## 
##               Reference
## Prediction     Beginner Intermediate Advanced
##   Beginner            3            3        1
##   Intermediate        2            1        1
##   Advanced            1            2        3
## 
## Overall Statistics
##                                           
##                Accuracy : 0.4118          
##                  95% CI : (0.1844, 0.6708)
##     No Information Rate : 0.3529          
##     P-Value [Acc > NIR] : 0.3912          
##                                           
##                   Kappa : 0.1192          
##                                           
##  Mcnemar's Test P-Value : 0.9115          
## 
## Statistics by Class:
## 
##                      Class: Beginner Class: Intermediate Class: Advanced
## Sensitivity                   0.5000             0.16667          0.6000
## Specificity                   0.6364             0.72727          0.7500
## Pos Pred Value                0.4286             0.25000          0.5000
## Neg Pred Value                0.7000             0.61538          0.8182
## Prevalence                    0.3529             0.35294          0.2941
## Detection Rate                0.1765             0.05882          0.1765
## Detection Prevalence          0.4118             0.23529          0.3529
## Balanced Accuracy             0.5682             0.44697          0.6750
# Repeat for literature_testing
class_probs_literature <- prop.table(table(literature_training$Level))

literature_weighted_pred <- factor(sample(names(class_probs_literature), 
                                          size = nrow(literature_testing), 
                                          replace = TRUE, 
                                          prob = class_probs_literature),
                                   levels = levels(literature_testing$Level))

confusionMatrix(literature_weighted_pred, literature_testing$Level)
## Confusion Matrix and Statistics
## 
##                        Reference
## Prediction              Children's literature Young adult fiction
##   Children's literature                     2                   3
##   Young adult fiction                       3                   0
##   Adult literature                          4                   0
##                        Reference
## Prediction              Adult literature
##   Children's literature                1
##   Young adult fiction                  2
##   Adult literature                     0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.1333          
##                  95% CI : (0.0166, 0.4046)
##     No Information Rate : 0.6             
##     P-Value [Acc > NIR] : 1.0000          
##                                           
##                   Kappa : -0.3542         
##                                           
##  Mcnemar's Test P-Value : 0.2839          
## 
## Statistics by Class:
## 
##                      Class: Children's literature Class: Young adult fiction
## Sensitivity                                0.2222                     0.0000
## Specificity                                0.3333                     0.5833
## Pos Pred Value                             0.3333                     0.0000
## Neg Pred Value                             0.2222                     0.7000
## Prevalence                                 0.6000                     0.2000
## Detection Rate                             0.1333                     0.0000
## Detection Prevalence                       0.4000                     0.3333
## Balanced Accuracy                          0.2778                     0.2917
##                      Class: Adult literature
## Sensitivity                           0.0000
## Specificity                           0.6667
## Pos Pred Value                        0.0000
## Neg Pred Value                        0.7273
## Prevalence                            0.2000
## Detection Rate                        0.0000
## Detection Prevalence                  0.2667
## Balanced Accuracy                     0.3333
#  --- 3. 1,000-fold randomization ---

set.seed(2)

n_iter <- 1000
metric_names <- c("Accuracy", "Kappa", "Sensitivity", "Specificity", 
                  "Precision", "Recall", "F1", "Prevalence", 
                  "Detection Rate", "Detection Prevalence", "Balanced Accuracy")

# --- For Graded Readers ---
levels_readers <- levels(readers_testing$Level)
# Create a nested list to store metrics per gradation level
random_metrics_readers <- list()
for (lev in levels_readers) {
  random_metrics_readers[[lev]] <- list()
  for (metric in metric_names) {
    random_metrics_readers[[lev]][[metric]] <- numeric(n_iter)
  }
}

# For each iteration, randomly shuffle predictions and compute one-vs-all confusion matrices
for (i in 1:n_iter) {
  # Randomly shuffle class labels for the entire test set
  random_labels <- sample(readers_testing$Level)
  
  # For each gradation level, compute a binary confusion matrix
  for (lev in levels_readers) {
    # Create binary predictions: TRUE if the random label equals the current level, FALSE otherwise
    pred_binary <- factor(random_labels == lev, levels = c(FALSE, TRUE))
    actual_binary <- factor(readers_testing$Level == lev, levels = c(FALSE, TRUE))
    
    # Compute confusion matrix for binary classification, specifying "TRUE" as the positive class
    cm <- confusionMatrix(pred_binary, actual_binary, positive = "TRUE")
    
    # Store overall metrics from cm$overall and per-class metrics from cm$byClass
    random_metrics_readers[[lev]][["Accuracy"]][i]            <- cm$overall["Accuracy"]
    random_metrics_readers[[lev]][["Kappa"]][i]               <- cm$overall["Kappa"]
    random_metrics_readers[[lev]][["Sensitivity"]][i]         <- cm$byClass["Sensitivity"]
    random_metrics_readers[[lev]][["Specificity"]][i]         <- cm$byClass["Specificity"]
    random_metrics_readers[[lev]][["Precision"]][i]           <- cm$byClass["Precision"]
    random_metrics_readers[[lev]][["Recall"]][i]              <- cm$byClass["Recall"]
    random_metrics_readers[[lev]][["F1"]][i]                  <- cm$byClass["F1"]
    random_metrics_readers[[lev]][["Prevalence"]][i]          <- cm$byClass["Prevalence"]
    random_metrics_readers[[lev]][["Detection Rate"]][i]      <- cm$byClass["Detection Rate"]
    random_metrics_readers[[lev]][["Detection Prevalence"]][i] <- cm$byClass["Detection Prevalence"]
    random_metrics_readers[[lev]][["Balanced Accuracy"]][i]   <- cm$byClass["Balanced Accuracy"]
  }
}

# Compute summary statistics (mean and sd) for each metric per gradation
summary_random_readers <- lapply(random_metrics_readers, function(class_metrics) {
  sapply(class_metrics, function(x) c(mean = mean(x), sd = sd(x)))
})

# --- For Literary Works ---
levels_literature <- levels(literature_testing$Level)
random_metrics_literature <- list()
for (lev in levels_literature) {
  random_metrics_literature[[lev]] <- list()
  for (metric in metric_names) {
    random_metrics_literature[[lev]][[metric]] <- numeric(n_iter)
  }
}

for (i in 1:n_iter) {
  random_labels <- sample(literature_testing$Level)
  
  for (lev in levels_literature) {
    pred_binary <- factor(random_labels == lev, levels = c(FALSE, TRUE))
    actual_binary <- factor(literature_testing$Level == lev, levels = c(FALSE, TRUE))
    
    cm <- confusionMatrix(pred_binary, actual_binary, positive = "TRUE")
    
    random_metrics_literature[[lev]][["Accuracy"]][i]            <- cm$overall["Accuracy"]
    random_metrics_literature[[lev]][["Kappa"]][i]               <- cm$overall["Kappa"]
    random_metrics_literature[[lev]][["Sensitivity"]][i]         <- cm$byClass["Sensitivity"]
    random_metrics_literature[[lev]][["Specificity"]][i]         <- cm$byClass["Specificity"]
    random_metrics_literature[[lev]][["Precision"]][i]           <- cm$byClass["Precision"]
    random_metrics_literature[[lev]][["Recall"]][i]              <- cm$byClass["Recall"]
    random_metrics_literature[[lev]][["F1"]][i]                  <- cm$byClass["F1"]
    random_metrics_literature[[lev]][["Prevalence"]][i]          <- cm$byClass["Prevalence"]
    random_metrics_literature[[lev]][["Detection Rate"]][i]      <- cm$byClass["Detection Rate"]
    random_metrics_literature[[lev]][["Detection Prevalence"]][i] <- cm$byClass["Detection Prevalence"]
    random_metrics_literature[[lev]][["Balanced Accuracy"]][i]   <- cm$byClass["Balanced Accuracy"]
  }
}

summary_random_literature <- lapply(random_metrics_literature, function(class_metrics) {
  sapply(class_metrics, function(x) c(mean = mean(x), sd = sd(x)))
})

# --- Print the results ---
cat("\n**1,000-Fold Randomization Results (Graded Readers)**\n")
## 
## **1,000-Fold Randomization Results (Graded Readers)**
for (lev in levels_readers) {
  cat("\n--", lev, "--\n")
  print(summary_random_readers[[lev]])
}
## 
## -- Beginner --
##       Accuracy        Kappa Sensitivity Specificity Precision    Recall  F1
## mean 0.5417647 -0.003257576   0.3508333  0.64590909 0.3508333 0.3508333 NaN
## sd   0.1107922  0.242567703   0.1569556  0.08561213 0.1569556 0.1569556  NA
##      Prevalence Detection Rate Detection Prevalence Balanced Accuracy
## mean  0.3529412     0.12382353            0.3529412         0.4983712
## sd    0.0000000     0.05539608            0.0000000         0.1212839
## 
## -- Intermediate --
##       Accuracy        Kappa Sensitivity Specificity Precision    Recall  F1
## mean 0.5416471 -0.003515152   0.3506667  0.64581818 0.3506667 0.3506667 NaN
## sd   0.1138274  0.249213047   0.1612555  0.08795755 0.1612555 0.1612555  NA
##      Prevalence Detection Rate Detection Prevalence Balanced Accuracy
## mean  0.3529412     0.12376471            0.3529412         0.4982424
## sd    0.0000000     0.05691371            0.0000000         0.1246065
## 
## -- Advanced --
##       Accuracy      Kappa Sensitivity Specificity Precision    Recall  F1
## mean 0.5828235 -0.0047000   0.2908000  0.70450000 0.2908000 0.2908000 NaN
## sd   0.1019284  0.2454777   0.1732783  0.07219931 0.1732783 0.1732783  NA
##      Prevalence Detection Rate Detection Prevalence Balanced Accuracy
## mean  0.2941176     0.08552941            0.2941176         0.4976500
## sd    0.0000000     0.05096422            0.0000000         0.1227388
cat("\n**1,000-Fold Randomization Results (Literary Works)**\n")
## 
## **1,000-Fold Randomization Results (Literary Works)**
for (lev in levels_literature) {
  cat("\n--", lev, "--\n")
  print(summary_random_literature[[lev]])
}
## 
## -- Children's literature --
##       Accuracy      Kappa Sensitivity Specificity Precision    Recall        F1
## mean 0.5164000 -0.0075000   0.5970000   0.3955000 0.5970000 0.5970000 0.5970000
## sd   0.1313751  0.2736981   0.1094792   0.1642188 0.1094792 0.1094792 0.1094792
##      Prevalence Detection Rate Detection Prevalence Balanced Accuracy
## mean        0.6     0.35820000                  0.6          0.496250
## sd          0.0     0.06568754                  0.0          0.136849
## 
## -- Young adult fiction --
##        Accuracy      Kappa Sensitivity Specificity Precision    Recall  F1
## mean 0.67840000 -0.0050000   0.1960000  0.79900000 0.1960000 0.1960000 NaN
## sd   0.08606692  0.2689591   0.2151673  0.05379182 0.2151673 0.2151673  NA
##      Prevalence Detection Rate Detection Prevalence Balanced Accuracy
## mean        0.2     0.03920000                  0.2         0.4975000
## sd          0.0     0.04303346                  0.0         0.1344796
## 
## -- Adult literature --
##        Accuracy       Kappa Sensitivity Specificity Precision    Recall  F1
## mean 0.68306667 0.009583333   0.2076667  0.80191667 0.2076667 0.2076667 NaN
## sd   0.08654271 0.270445971   0.2163568  0.05408919 0.2163568 0.2163568  NA
##      Prevalence Detection Rate Detection Prevalence Balanced Accuracy
## mean        0.2     0.04153333                  0.2         0.5047917
## sd          0.0     0.04327136                  0.0         0.1352230

PERMUTATION TESTS

Using the following variables

# 1) AVERAGE DEPTH OF THE CONTEXT TREES

# === Step 1. Calculate the difference in sample means: ===

GR_mean_ctxt_depth_A1A2 <- mean(readers_corpus$Ctxt_Avg_Depth[readers_corpus$Level=="Beginner"])
GR_mean_ctxt_depth_B1 <- mean(readers_corpus$Ctxt_Avg_Depth[readers_corpus$Level=="Intermediate"])
GR_mean_ctxt_depth_B2 <- mean(readers_corpus$Ctxt_Avg_Depth[readers_corpus$Level=="Advanced"])

LW_mean_ctxt_depth_A1A2 <- mean(literature_corpus$Ctxt_Avg_Depth[literature_corpus$Level=="Children's literature"])
LW_mean_ctxt_depth_B1 <- mean(literature_corpus$Ctxt_Avg_Depth[literature_corpus$Level=="Young adult fiction"])
LW_mean_ctxt_depth_B2 <- mean(literature_corpus$Ctxt_Avg_Depth[literature_corpus$Level=="Adult literature"])

RC_mean_ctxt_depth <- mean(reference_corpus$Ctxt_Avg_Depth)

# === Step 2. Calculate the absolute difference in means ===

GR_LW_mean_ctxt_depth_stat_A1A2 <- abs(GR_mean_ctxt_depth_A1A2 - LW_mean_ctxt_depth_A1A2)
GR_LW_mean_ctxt_depth_stat_B1 <- abs(GR_mean_ctxt_depth_B1 - LW_mean_ctxt_depth_B1)
GR_LW_mean_ctxt_depth_stat_B2 <- abs(GR_mean_ctxt_depth_B2 - LW_mean_ctxt_depth_B2)

GR_RC_mean_ctxt_depth_stat_A1A2 <- abs(GR_mean_ctxt_depth_A1A2 - RC_mean_ctxt_depth)
GR_RC_mean_ctxt_depth_stat_B1 <- abs(GR_mean_ctxt_depth_B1 - RC_mean_ctxt_depth)
GR_RC_mean_ctxt_depth_stat_B2 <- abs(GR_mean_ctxt_depth_B2 - RC_mean_ctxt_depth)

LW_RC_mean_ctxt_depth_stat_A1A2 <- abs(LW_mean_ctxt_depth_A1A2 - RC_mean_ctxt_depth)
LW_RC_mean_ctxt_depth_stat_B1 <- abs(LW_mean_ctxt_depth_B1 - RC_mean_ctxt_depth)
LW_RC_mean_ctxt_depth_stat_B2 <- abs(LW_mean_ctxt_depth_B2 - RC_mean_ctxt_depth)

# === Step 3. Perform the permutation test ===

# AVERAGE DEPTH OF THE CONTEXT TREES

# Number of observations to sample
GR_n <- length(readers_corpus$Level)
LW_n <- length(literature_corpus$Level)
RC_n <- length(reference_corpus$group)

# Number of permutation samples to take
P <- 10000

# Variable we will resample from
GR_ctxt_depth_variable <- readers_corpus$Ctxt_Avg_Depth
LW_ctxt_depth_variable <- literature_corpus$Ctxt_Avg_Depth
RC_ctxt_depth_variable <- reference_corpus$Ctxt_Avg_Depth

# Initialize a matrix to store the permutation data. Each column is a permutation sample of data.
GR_ctxt_depth_PermSamples <- matrix(0, nrow=GR_n, ncol=P)
LW_ctxt_depth_PermSamples <- matrix(0, nrow=LW_n, ncol=P)
RC_ctxt_depth_PermSamples <- matrix(0, nrow=RC_n, ncol=P)

set.seed(1979)

for(i in 1:P){
  GR_ctxt_depth_PermSamples[, i] <- sample(GR_ctxt_depth_variable, size=GR_n, replace=FALSE)
}

set.seed(1979)

for(i in 1:P){
  LW_ctxt_depth_PermSamples[, i] <- sample(LW_ctxt_depth_variable, size=LW_n, replace=FALSE)
}

set.seed(1979)

for(i in 1:P){
  RC_ctxt_depth_PermSamples[, i] <- sample(RC_ctxt_depth_variable, size=RC_n, replace=FALSE)
}

# Initialize vectors to store the test stats
Perm.test.stat_GR_LW_ctxt_depth_A1A2 <- rep(0, P)
Perm.test.stat_GR_LW_ctxt_depth_B1 <- rep(0, P)
Perm.test.stat_GR_LW_ctxt_depth_B2 <- rep(0, P)

Perm.test.stat_GR_RC_ctxt_depth_A1A2 <- rep(0, P)
Perm.test.stat_GR_RC_ctxt_depth_B1 <- rep(0, P)
Perm.test.stat_GR_RC_ctxt_depth_B2 <- rep(0, P)

Perm.test.stat_LW_RC_ctxt_depth_A1A2 <- rep(0, P)
Perm.test.stat_LW_RC_ctxt_depth_B1 <- rep(0, P)
Perm.test.stat_LW_RC_ctxt_depth_B2 <- rep(0, P)

# Loop through, and calculate test stats
for(i in 1:P){
  Perm.test.stat_GR_LW_ctxt_depth_A1A2[i] <- abs(mean(GR_ctxt_depth_PermSamples[
    readers_corpus$Level=="Beginner", i]) - mean(LW_ctxt_depth_PermSamples[
      literature_corpus$Level=="Children's literature", i]))
}
for(i in 1:P){
  Perm.test.stat_GR_LW_ctxt_depth_B1[i] <- abs(mean(GR_ctxt_depth_PermSamples[
    readers_corpus$Level=="Intermediate", i]) - mean(LW_ctxt_depth_PermSamples[
      literature_corpus$Level=="Young adult fiction", i]))
}
for(i in 1:P){
  Perm.test.stat_GR_LW_ctxt_depth_B2[i] <- abs(mean(GR_ctxt_depth_PermSamples[
    readers_corpus$Level=="Advanced", i]) - mean(LW_ctxt_depth_PermSamples[
      literature_corpus$Level=="Adult literature", i]))
}

for(i in 1:P){
  Perm.test.stat_GR_RC_ctxt_depth_A1A2[i] <- abs(mean(GR_ctxt_depth_PermSamples[
    readers_corpus$Level=="Beginner", i]) - mean(RC_ctxt_depth_PermSamples[
      reference_corpus$group==1, i]))
}
for(i in 1:P){
  Perm.test.stat_GR_RC_ctxt_depth_B1[i] <- abs(mean(GR_ctxt_depth_PermSamples[
    readers_corpus$Level=="Intermediate", i]) - mean(RC_ctxt_depth_PermSamples[
      reference_corpus$group==1, i]))
}
for(i in 1:P){
  Perm.test.stat_GR_RC_ctxt_depth_B2[i] <- abs(mean(GR_ctxt_depth_PermSamples[
    readers_corpus$Level=="Advanced", i]) - mean(RC_ctxt_depth_PermSamples[
      reference_corpus$group==1, i]))
}

# === Calculate the permutation p-value ===

# === GRs vs. LWs ===

# If there is no difference in the mean average depth of the context trees of beginner GRs and LWs, we would observe a test statistic of 1.091 or more by chance roughly 0% of the time.  
mean(Perm.test.stat_GR_LW_ctxt_depth_A1A2 >= GR_LW_mean_ctxt_depth_stat_A1A2)
## [1] 0
# If there is no difference in the mean average depth of the context trees of intermediate GRs and LWs, we would observe a test statistic of 0.028 or more by chance roughly 98.14% of the time.
mean(Perm.test.stat_GR_LW_ctxt_depth_B1 >= GR_LW_mean_ctxt_depth_stat_B1)
## [1] 0.9814
# If there is no difference in the mean average depth of the context trees of advanced GRs and LWs, we would observe a test statistic of 0.262 or more by chance roughly 74.50% of the time.
mean(Perm.test.stat_GR_LW_ctxt_depth_B2 >= GR_LW_mean_ctxt_depth_stat_B2)
## [1] 0.745
# === GRs vs. RC ===

# If there is no difference in the mean average depth of the context trees of beginner GRs and the RC, we would observe a test statistic of 2.999 or more by chance roughly 0% of the time.
mean(Perm.test.stat_GR_RC_ctxt_depth_A1A2 >= GR_RC_mean_ctxt_depth_stat_A1A2)
## [1] 0
# If there is no difference in the mean average depth of the context trees of intermediate GRs and the RC, we would observe a test statistic of 2.158 or more by chance roughly 77.81% of the time.
mean(Perm.test.stat_GR_RC_ctxt_depth_B1 >= GR_RC_mean_ctxt_depth_stat_B1)
## [1] 0.7781
# If there is no difference in the mean average depth of the context trees of advanced GRs and the RC, we would observe a test statistic of 1.559 or more by chance roughly 99.97% of the time.
mean(Perm.test.stat_GR_RC_ctxt_depth_B2 >= GR_RC_mean_ctxt_depth_stat_B2)
## [1] 0.9997
# === LW vs. RC ===

# If there is no difference in the mean average depth of the context trees of children's LW and the RC, we would observe a test statistic of 1.908 or more by chance roughly 0% of the time.
mean(Perm.test.stat_LW_RC_ctxt_depth_A1A2 >= LW_RC_mean_ctxt_depth_stat_A1A2)
## [1] 0
# If there is no difference in the mean average depth of the context trees of young adult LW and the RC, we would observe a test statistic of 2.186 or more by chance roughly 0% of the time.
mean(Perm.test.stat_LW_RC_ctxt_depth_B1 >= LW_RC_mean_ctxt_depth_stat_B1)
## [1] 0
# If there is no difference in the mean average depth of the context trees of adult LW and the RC, we would observe a test statistic of 1.296 or more by chance roughly 0% of the time.
mean(Perm.test.stat_LW_RC_ctxt_depth_B2 >= LW_RC_mean_ctxt_depth_stat_B2)
## [1] 0
# --------------------

# 2) RELATIVE FREQUENCY OF THE SUBJUNCTIVE MOOD

# === Step 1. Calculate the difference in sample means: ===

GR_mean_sub_A1A2 <- mean(readers_corpus$Freq_Sub[readers_corpus$Level=="Beginner"])
GR_mean_sub_B1 <- mean(readers_corpus$Freq_Sub[readers_corpus$Level=="Intermediate"])
GR_mean_sub_B2 <- mean(readers_corpus$Freq_Sub[readers_corpus$Level=="Advanced"])

LW_mean_sub_A1A2 <- mean(literature_corpus$Freq_Sub[literature_corpus$Level=="Children's literature"])
LW_mean_sub_B1 <- mean(literature_corpus$Freq_Sub[literature_corpus$Level=="Young adult fiction"])
LW_mean_sub_B2 <- mean(literature_corpus$Freq_Sub[literature_corpus$Level=="Adult literature"])

RC_mean_sub <- mean(reference_corpus$Freq_Sub)

# === Step 2. Calculate the absolute difference in means ===

GR_LW_mean_sub_stat_A1A2 <- abs(GR_mean_sub_A1A2 - LW_mean_sub_A1A2)
GR_LW_mean_sub_stat_B1 <- abs(GR_mean_sub_B1 - LW_mean_sub_B1)
GR_LW_mean_sub_stat_B2 <- abs(GR_mean_sub_B2 - LW_mean_sub_B2)

GR_RC_mean_sub_stat_A1A2 <- abs(GR_mean_sub_A1A2 - RC_mean_sub)
GR_RC_mean_sub_stat_B1 <- abs(GR_mean_sub_B1 - RC_mean_sub)
GR_RC_mean_sub_stat_B2 <- abs(GR_mean_sub_B2 - RC_mean_sub)

LW_RC_mean_sub_stat_A1A2 <- abs(LW_mean_sub_A1A2 - RC_mean_sub)
LW_RC_mean_sub_stat_B1 <- abs(LW_mean_sub_B1 - RC_mean_sub)
LW_RC_mean_sub_stat_B2 <- abs(LW_mean_sub_B2 - RC_mean_sub)

# === Step 3. Perform the permutation test ===

# Variable we will resample from
GR_sub_variable <- readers_corpus$Freq_Sub
LW_sub_variable <- literature_corpus$Freq_Sub
RC_sub_variable <- reference_corpus$Freq_Sub

# Initialize a matrix to store the permutation data. Each column is a permutation sample of data.
GR_sub_PermSamples <- matrix(0, nrow=GR_n, ncol=P)
LW_sub_PermSamples <- matrix(0, nrow=LW_n, ncol=P)
RC_sub_PermSamples <- matrix(0, nrow=RC_n, ncol=P)

set.seed(1979)

for(i in 1:P){
  GR_sub_PermSamples[, i] <- sample(GR_sub_variable, size=GR_n, replace=FALSE)
}

set.seed(1979)

for(i in 1:P){
  LW_sub_PermSamples[, i] <- sample(LW_sub_variable, size=LW_n, replace=FALSE)
}

set.seed(1979)

for(i in 1:P){
  RC_sub_PermSamples[, i] <- sample(RC_sub_variable, size=RC_n, replace=FALSE)
}

# Initialize vectors to store the test stats
Perm.test.stat_GR_LW_sub_A1A2 <- rep(0, P)
Perm.test.stat_GR_LW_sub_B1 <- rep(0, P)
Perm.test.stat_GR_LW_sub_B2 <- rep(0, P)

Perm.test.stat_GR_RC_sub_A1A2 <- rep(0, P)
Perm.test.stat_GR_RC_sub_B1 <- rep(0, P)
Perm.test.stat_GR_RC_sub_B2 <- rep(0, P)

Perm.test.stat_LW_RC_sub_A1A2 <- rep(0, P)
Perm.test.stat_LW_RC_sub_B1 <- rep(0, P)
Perm.test.stat_LW_RC_sub_B2 <- rep(0, P)

# Loop through, and calculate test stats
for(i in 1:P){
  Perm.test.stat_GR_LW_sub_A1A2[i] <- abs(mean(GR_sub_PermSamples[
    readers_corpus$Level=="Beginner", i]) - mean(LW_sub_PermSamples[
      literature_corpus$Level=="Children's literature", i]))
}
for(i in 1:P){
  Perm.test.stat_GR_LW_sub_B1[i] <- abs(mean(GR_sub_PermSamples[
    readers_corpus$Level=="Intermediate", i]) - mean(LW_sub_PermSamples[
      literature_corpus$Level=="Young adult fiction", i]))
}
for(i in 1:P){
  Perm.test.stat_GR_LW_sub_B2[i] <- abs(mean(GR_sub_PermSamples[
    readers_corpus$Level=="Advanced", i]) - mean(LW_sub_PermSamples[
      literature_corpus$Level=="Adult literature", i]))
}

for(i in 1:P){
  Perm.test.stat_GR_RC_sub_A1A2[i] <- abs(mean(GR_sub_PermSamples[
    readers_corpus$Level=="Beginner", i]) - mean(RC_sub_PermSamples[
      reference_corpus$group==1, i]))
}
for(i in 1:P){
  Perm.test.stat_GR_RC_sub_B1[i] <- abs(mean(GR_sub_PermSamples[
    readers_corpus$Level=="Intermediate", i]) - mean(RC_sub_PermSamples[
      reference_corpus$group==1, i]))
}
for(i in 1:P){
  Perm.test.stat_GR_RC_sub_B2[i] <- abs(mean(GR_sub_PermSamples[
    readers_corpus$Level=="Advanced", i]) - mean(RC_sub_PermSamples[
      reference_corpus$group==1, i]))
}

# === Calculate the permutation p-value ===

# === GRs vs. LWs ===
  
# If there is no difference in the mean average subjunctive frequencies of beginner GRs and LWs, we would observe a test statistic of 0.0039 or more by chance roughly 0.57% of the time.
mean(Perm.test.stat_GR_LW_sub_A1A2 >= GR_LW_mean_sub_stat_A1A2)
## [1] 0.0057
# If there is no difference in the mean average subjunctive frequencies of intermediate GRs and LWs, we would observe a test statistic of 0.002 or more by chance roughly 53.95% of the time.
mean(Perm.test.stat_GR_LW_sub_B1 >= GR_LW_mean_sub_stat_B1)
## [1] 0.5395
# If there is no difference in the mean average subjunctive frequencies of advanced GRs and LWs, we would observe a test statistic of 0.001 or more by chance roughly 77.23% of the time.
mean(Perm.test.stat_GR_LW_sub_B2 >= GR_LW_mean_sub_stat_B2)
## [1] 0.7723
# === GRs vs. RC ===

# If there is no difference in the mean average subjunctive frequencies of beginner GRs and RC, we would observe a test statistic of 0.0041 or more by chance roughly 0% of the time.
mean(Perm.test.stat_GR_RC_sub_A1A2 >= GR_RC_mean_sub_stat_A1A2)
## [1] 0
# If there is no difference in the mean average subjunctive frequencies of intermediate GRs and RC, we would observe a test statistic of 0.0014 or more by chance roughly 61.44% of the time.
mean(Perm.test.stat_GR_RC_sub_B1 >= GR_RC_mean_sub_stat_B1)
## [1] 0.6144
# If there is no difference in the mean average subjunctive frequencies of advanced GRs and RC, we would observe a test statistic of 0.0012 or more by chance roughly 68.73% of the time.
mean(Perm.test.stat_GR_RC_sub_B2 >= GR_RC_mean_sub_stat_B2)
## [1] 0.6873
# === LW vs. RC ===

# If there is no difference in the mean average subjunctive frequencies of beginner LW and RC, we would observe a test statistic of 0.00024 or more by chance roughly 0% of the time.
mean(Perm.test.stat_LW_RC_sub_A1A2 >= LW_RC_mean_sub_stat_A1A2)
## [1] 0
# If there is no difference in the mean average subjunctive frequencies of intermediate LW and RC, we would observe a test statistic of 0.0006 or more by chance roughly 0% of the time.
mean(Perm.test.stat_LW_RC_sub_B1 >= LW_RC_mean_sub_stat_B1)
## [1] 0
# If there is no difference in the mean average subjunctive frequencies of advanced LW and RC, we would observe a test statistic of 0.0023 or more by chance roughly 0% of the time.
mean(Perm.test.stat_LW_RC_sub_B2 >= LW_RC_mean_sub_stat_B2)
## [1] 0
# --------------------

# 3) RELATIVE FREQUENCY OF A1-A2 WORDS

# === Step 1. Calculate the difference in sample means: ===

GR_mean_A1A2_A1A2 <- mean(readers_corpus$Freq_A1A2[readers_corpus$Level=="Beginner"])
GR_mean_A1A2_B1 <- mean(readers_corpus$Freq_A1A2[readers_corpus$Level=="Intermediate"])
GR_mean_A1A2_B2 <- mean(readers_corpus$Freq_A1A2[readers_corpus$Level=="Advanced"])

LW_mean_A1A2_A1A2 <- mean(literature_corpus$Freq_A1A2[literature_corpus$Level=="Children's literature"])
LW_mean_A1A2_B1 <- mean(literature_corpus$Freq_A1A2[literature_corpus$Level=="Young adult fiction"])
LW_mean_A1A2_B2 <- mean(literature_corpus$Freq_A1A2[literature_corpus$Level=="Adult literature"])

RC_mean_A1A2 <- mean(reference_corpus$Freq_A1A2)

# === Step 2. Calculate the absolute difference in means ===

GR_LW_mean_A1A2_stat_A1A2 <- abs(GR_mean_A1A2_A1A2 - LW_mean_A1A2_A1A2)
GR_LW_mean_A1A2_stat_B1 <- abs(GR_mean_A1A2_B1 - LW_mean_A1A2_B1)
GR_LW_mean_A1A2_stat_B2 <- abs(GR_mean_A1A2_B2 - LW_mean_A1A2_B2)

GR_RC_mean_A1A2_stat_A1A2 <- abs(GR_mean_A1A2_A1A2 - RC_mean_A1A2)
GR_RC_mean_A1A2_stat_B1 <- abs(GR_mean_A1A2_B1 - RC_mean_A1A2)
GR_RC_mean_A1A2_stat_B2 <- abs(GR_mean_A1A2_B2 - RC_mean_A1A2)

LW_RC_mean_A1A2_stat_A1A2 <- abs(LW_mean_A1A2_A1A2 - RC_mean_A1A2)
LW_RC_mean_A1A2_stat_B1 <- abs(LW_mean_A1A2_B1 - RC_mean_A1A2)
LW_RC_mean_A1A2_stat_B2 <- abs(LW_mean_A1A2_B2 - RC_mean_A1A2)

# === Step 3. Perform the permutation test ===

# Variable we will resample from
GR_A1A2_variable <- readers_corpus$Freq_A1A2
LW_A1A2_variable <- literature_corpus$Freq_A1A2
RC_A1A2_variable <- reference_corpus$Freq_A1A2

# Initialize a matrix to store the permutation data. Each column is a permutation sample of data.
GR_A1A2_PermSamples <- matrix(0, nrow=GR_n, ncol=P)
LW_A1A2_PermSamples <- matrix(0, nrow=LW_n, ncol=P)
RC_A1A2_PermSamples <- matrix(0, nrow=RC_n, ncol=P)

set.seed(1979)

for(i in 1:P){
  GR_A1A2_PermSamples[, i] <- sample(GR_A1A2_variable, size=GR_n, replace=FALSE)
}

set.seed(1979)

for(i in 1:P){
  LW_A1A2_PermSamples[, i] <- sample(LW_A1A2_variable, size=LW_n, replace=FALSE)
}

set.seed(1979)

for(i in 1:P){
  RC_A1A2_PermSamples[, i] <- sample(RC_A1A2_variable, size=RC_n, replace=FALSE)
}

# Initialize vectors to store the test stats
Perm.test.stat_GR_LW_A1A2_A1A2 <- rep(0, P)
Perm.test.stat_GR_LW_A1A2_B1 <- rep(0, P)
Perm.test.stat_GR_LW_A1A2_B2 <- rep(0, P)

Perm.test.stat_GR_RC_A1A2_A1A2 <- rep(0, P)
Perm.test.stat_GR_RC_A1A2_B1 <- rep(0, P)
Perm.test.stat_GR_RC_A1A2_B2 <- rep(0, P)

Perm.test.stat_LW_RC_A1A2_A1A2 <- rep(0, P)
Perm.test.stat_LW_RC_A1A2_B1 <- rep(0, P)
Perm.test.stat_LW_RC_A1A2_B2 <- rep(0, P)

# Loop through, and calculate test stats
for(i in 1:P){
  Perm.test.stat_GR_LW_A1A2_A1A2[i] <- abs(mean(GR_A1A2_PermSamples[
    readers_corpus$Level=="Beginner", i]) - mean(LW_A1A2_PermSamples[
      literature_corpus$Level=="Children's literature", i]))
}
for(i in 1:P){
  Perm.test.stat_GR_LW_A1A2_B1[i] <- abs(mean(GR_A1A2_PermSamples[
    readers_corpus$Level=="Intermediate", i]) - mean(LW_A1A2_PermSamples[
      literature_corpus$Level=="Young adult fiction", i]))
}
for(i in 1:P){
  Perm.test.stat_GR_LW_A1A2_B2[i] <- abs(mean(GR_A1A2_PermSamples[
    readers_corpus$Level=="Advanced", i]) - mean(LW_A1A2_PermSamples[
      literature_corpus$Level=="Adult literature", i]))
}

for(i in 1:P){
  Perm.test.stat_GR_RC_A1A2_A1A2[i] <- abs(mean(GR_A1A2_PermSamples[
    readers_corpus$Level=="Beginner", i]) - mean(RC_A1A2_PermSamples[
      reference_corpus$group==1, i]))
}
for(i in 1:P){
  Perm.test.stat_GR_RC_A1A2_B1[i] <- abs(mean(GR_A1A2_PermSamples[
    readers_corpus$Level=="Intermediate", i]) - mean(RC_A1A2_PermSamples[
      reference_corpus$group==1, i]))
}
for(i in 1:P){
  Perm.test.stat_GR_RC_A1A2_B2[i] <- abs(mean(GR_A1A2_PermSamples[
    readers_corpus$Level=="Advanced", i]) - mean(RC_A1A2_PermSamples[
      reference_corpus$group==1, i]))
}

# === Calculate the permutation p-value ===

# === GRs vs. LWs ===
  
# If there is no difference in the mean average A1-A2 word frequencies of beginner GRs and LWs, we would observe a test statistic of 0.033 or more by chance roughly 0.15% of the time.
mean(Perm.test.stat_GR_LW_A1A2_A1A2 >= GR_LW_mean_A1A2_stat_A1A2)
## [1] 0.0015
# If there is no difference in the mean average A1-A2 word frequencies of intermediate GRs and LWs, we would observe a test statistic of 0.014 or more by chance roughly 59.33% of the time.
mean(Perm.test.stat_GR_LW_A1A2_B1 >= GR_LW_mean_A1A2_stat_B1)
## [1] 0.5933
# If there is no difference in the mean average A1-A2 word frequencies of advanced GRs and LWs, we would observe a test statistic of 0.013 or more by chance roughly 62.89% of the time.
mean(Perm.test.stat_GR_LW_A1A2_B2 >= GR_LW_mean_A1A2_stat_B2)
## [1] 0.6289
# === GRs vs. RC ===

# If there is no difference in the mean average A1-A2 word frequencies of beginner GRs and the RC, we would observe a test statistic of 0.103 or more by chance roughly 0% of the time.
mean(Perm.test.stat_GR_RC_A1A2_A1A2 >= GR_RC_mean_A1A2_stat_A1A2)
## [1] 0
# If there is no difference in the mean average A1-A2 word frequencies of intermediate GRs and the RC, we would observe a test statistic of 0.079 or more by chance roughly 49.38% of the time.
mean(Perm.test.stat_GR_RC_A1A2_B1 >= GR_RC_mean_A1A2_stat_B1)
## [1] 0.4938
# If there is no difference in the mean average A1-A2 word frequencies of advanced GRs and the RC, we would observe a test statistic of 0.048 or more by chance roughly 100% of the time.
mean(Perm.test.stat_GR_RC_A1A2_B2 >= GR_RC_mean_A1A2_stat_B2)
## [1] 1
# === LW vs. RC ===

# If there is no difference in the mean average A1-A2 word frequencies of children LWs and the RC, we would observe a test statistic of 0.070 or more by chance roughly 0% of the time.
mean(Perm.test.stat_LW_RC_A1A2_A1A2 >= LW_RC_mean_A1A2_stat_A1A2)
## [1] 0
# If there is no difference in the mean average A1-A2 word frequencies of young adult LWs and the RC, we would observe a test statistic of 0.065 or more by chance roughly 0% of the time.
mean(Perm.test.stat_LW_RC_A1A2_B1 >= LW_RC_mean_A1A2_stat_B1)
## [1] 0
# If there is no difference in the mean average A1-A2 word frequencies of adult LW and the RC, we would observe a test statistic of 0.035 or more by chance roughly 0% of the time.
mean(Perm.test.stat_LW_RC_A1A2_B2 >= LW_RC_mean_A1A2_stat_B2)
## [1] 0
# --------------------

# 4) TFIDF OF THE GRADED VOCABULARY'S CONTEXT

# === Step 1. Calculate the difference in sample means: ===

GR_mean_ctxt_TFIDF_A1A2 <- mean(readers_corpus$Ctxt_TFIDF[readers_corpus$Level=="Beginner"])
GR_mean_ctxt_TFIDF_B1 <- mean(readers_corpus$Ctxt_TFIDF[readers_corpus$Level=="Intermediate"])
GR_mean_ctxt_TFIDF_B2 <- mean(readers_corpus$Ctxt_TFIDF[readers_corpus$Level=="Advanced"])

LW_mean_ctxt_TFIDF_A1A2 <- mean(literature_corpus$Ctxt_TFIDF[literature_corpus$Level=="Children's literature"])
LW_mean_ctxt_TFIDF_B1 <- mean(literature_corpus$Ctxt_TFIDF[literature_corpus$Level=="Young adult fiction"])
LW_mean_ctxt_TFIDF_B2 <- mean(literature_corpus$Ctxt_TFIDF[literature_corpus$Level=="Adult literature"])

RC_mean_ctxt_TFIDF <- mean(reference_corpus$Ctxt_TFIDF)

# === Step 2. Calculate the absolute difference in means ===

GR_LW_mean_ctxt_TFIDF_stat_A1A2 <- abs(GR_mean_ctxt_TFIDF_A1A2 - LW_mean_ctxt_TFIDF_A1A2)
GR_LW_mean_ctxt_TFIDF_stat_B1 <- abs(GR_mean_ctxt_TFIDF_B1 - LW_mean_ctxt_TFIDF_B1)
GR_LW_mean_ctxt_TFIDF_stat_B2 <- abs(GR_mean_ctxt_TFIDF_B2 - LW_mean_ctxt_TFIDF_B2)

GR_RC_mean_ctxt_TFIDF_stat_A1A2 <- abs(GR_mean_ctxt_TFIDF_A1A2 - RC_mean_ctxt_TFIDF)
GR_RC_mean_ctxt_TFIDF_stat_B1 <- abs(GR_mean_ctxt_TFIDF_B1 - RC_mean_ctxt_TFIDF)
GR_RC_mean_ctxt_TFIDF_stat_B2 <- abs(GR_mean_ctxt_TFIDF_B2 - RC_mean_ctxt_TFIDF)

LW_RC_mean_ctxt_TFIDF_stat_A1A2 <- abs(LW_mean_ctxt_TFIDF_A1A2 - RC_mean_ctxt_TFIDF)
LW_RC_mean_ctxt_TFIDF_stat_B1 <- abs(LW_mean_ctxt_TFIDF_B1 - RC_mean_ctxt_TFIDF)
LW_RC_mean_ctxt_TFIDF_stat_B2 <- abs(LW_mean_ctxt_TFIDF_B2 - RC_mean_ctxt_TFIDF)

# === Step 3. Perform the permutation test ===

# Variable we will resample from
GR_ctxt_TFIDF_variable <- readers_corpus$Ctxt_TFIDF
LW_ctxt_TFIDF_variable <- literature_corpus$Ctxt_TFIDF
RC_ctxt_TFIDF_variable <- reference_corpus$Ctxt_TFIDF

# Initialize a matrix to store the permutation data. Each column is a permutation sample of data.
GR_ctxt_TFIDF_PermSamples <- matrix(0, nrow=GR_n, ncol=P)
LW_ctxt_TFIDF_PermSamples <- matrix(0, nrow=LW_n, ncol=P)
RC_ctxt_TFIDF_PermSamples <- matrix(0, nrow=RC_n, ncol=P)

set.seed(1979)

for(i in 1:P){
  GR_ctxt_TFIDF_PermSamples[, i] <- sample(GR_ctxt_TFIDF_variable, size=GR_n, replace=FALSE)
}

set.seed(1979)

for(i in 1:P){
  LW_ctxt_TFIDF_PermSamples[, i] <- sample(LW_ctxt_TFIDF_variable, size=LW_n, replace=FALSE)
}

set.seed(1979)

for(i in 1:P){
  RC_ctxt_TFIDF_PermSamples[, i] <- sample(RC_ctxt_TFIDF_variable, size=RC_n, replace=FALSE)
}

# Initialize vectors to store the test stats
Perm.test.stat_GR_LW_ctxt_TFIDF_A1A2 <- rep(0, P)
Perm.test.stat_GR_LW_ctxt_TFIDF_B1 <- rep(0, P)
Perm.test.stat_GR_LW_ctxt_TFIDF_B2 <- rep(0, P)

Perm.test.stat_GR_RC_ctxt_TFIDF_A1A2 <- rep(0, P)
Perm.test.stat_GR_RC_ctxt_TFIDF_B1 <- rep(0, P)
Perm.test.stat_GR_RC_ctxt_TFIDF_B2 <- rep(0, P)

Perm.test.stat_LW_RC_ctxt_TFIDF_A1A2 <- rep(0, P)
Perm.test.stat_LW_RC_ctxt_TFIDF_B1 <- rep(0, P)
Perm.test.stat_LW_RC_ctxt_TFIDF_B2 <- rep(0, P)

# Loop through, and calculate test stats
for(i in 1:P){
  Perm.test.stat_GR_LW_ctxt_TFIDF_A1A2[i] <- abs(mean(GR_ctxt_TFIDF_PermSamples[
    readers_corpus$Level=="Beginner", i]) - mean(LW_ctxt_TFIDF_PermSamples[
      literature_corpus$Level=="Children's literature", i]))
}
for(i in 1:P){
  Perm.test.stat_GR_LW_ctxt_TFIDF_B1[i] <- abs(mean(GR_ctxt_TFIDF_PermSamples[
    readers_corpus$Level=="Intermediate", i]) - mean(LW_ctxt_TFIDF_PermSamples[
      literature_corpus$Level=="Young adult fiction", i]))
}
for(i in 1:P){
  Perm.test.stat_GR_LW_ctxt_TFIDF_B2[i] <- abs(mean(GR_ctxt_TFIDF_PermSamples[
    readers_corpus$Level=="Advanced", i]) - mean(LW_ctxt_TFIDF_PermSamples[
      literature_corpus$Level=="Adult literature", i]))
}

for(i in 1:P){
  Perm.test.stat_GR_RC_ctxt_TFIDF_A1A2[i] <- abs(mean(GR_ctxt_TFIDF_PermSamples[
    readers_corpus$Level=="Beginner", i]) - mean(RC_ctxt_TFIDF_PermSamples[
      reference_corpus$group==1, i]))
}
for(i in 1:P){
  Perm.test.stat_GR_RC_ctxt_TFIDF_B1[i] <- abs(mean(GR_ctxt_TFIDF_PermSamples[
    readers_corpus$Level=="Intermediate", i]) - mean(RC_ctxt_TFIDF_PermSamples[
      reference_corpus$group==1, i]))
}
for(i in 1:P){
  Perm.test.stat_GR_RC_ctxt_TFIDF_B2[i] <- abs(mean(GR_ctxt_TFIDF_PermSamples[
    readers_corpus$Level=="Advanced", i]) - mean(RC_ctxt_TFIDF_PermSamples[
      reference_corpus$group==1, i]))
}

# === Calculate the permutation p-value ===

# === GRs vs. LWs ===
  
# If there is no difference in the mean average graded vocabulary's context TFIDF of beginner GRs and LWs, we would observe a test statistic of 0.00028 or more by chance roughly 0.86% of the time.
mean(Perm.test.stat_GR_LW_ctxt_TFIDF_A1A2 >= GR_LW_mean_ctxt_TFIDF_stat_A1A2)
## [1] 0.0086
# If there is no difference in the mean average graded vocabulary's context TFIDF of intermediate GRs and LWs, we would observe a test statistic of 0.00025 or more by chance roughly 11.41% of the time.
mean(Perm.test.stat_GR_LW_ctxt_TFIDF_B1 >= GR_LW_mean_ctxt_TFIDF_stat_B1)
## [1] 0.1141
# If there is no difference in the mean average graded vocabulary's context TFIDF of advanced GRs and LWs, we would observe a test statistic of 0.00053 or more by chance roughly 0.13% of the time.
mean(Perm.test.stat_GR_LW_ctxt_TFIDF_B2 >= GR_LW_mean_ctxt_TFIDF_stat_B2)
## [1] 0.0013
# === GRs vs. RC ===

# If there is no difference in the mean average graded vocabulary's context TFIDF of beginner GRs and the RC, we would observe a test statistic of 0.0072 or more by chance roughly 27.25% of the time.
mean(Perm.test.stat_GR_RC_ctxt_TFIDF_A1A2 >= GR_RC_mean_ctxt_TFIDF_stat_A1A2)
## [1] 0.2725
# If there is no difference in the mean average graded vocabulary's context TFIDF of intermediate GRs and the RC, we would observe a test statistic of 0.0072 or more by chance roughly 22.36% of the time.
mean(Perm.test.stat_GR_RC_ctxt_TFIDF_B1 >= GR_RC_mean_ctxt_TFIDF_stat_B1)
## [1] 0.2236
# If there is no difference in the mean average graded vocabulary's context TFIDF of advanced GRs and the RC, we would observe a test statistic of 0.0070 or more by chance roughly 92.58% of the time.
mean(Perm.test.stat_GR_RC_ctxt_TFIDF_B2 >= GR_RC_mean_ctxt_TFIDF_stat_B2)
## [1] 0.9258
# === LWs vs. RC ===

# If there is no difference in the mean average graded vocabulary's context TFIDF of beginner LWs and the RC, we would observe a test statistic of 0.0069 or more by chance roughly 0% of the time.
mean(Perm.test.stat_LW_RC_ctxt_TFIDF_A1A2 >= LW_RC_mean_ctxt_TFIDF_stat_A1A2)
## [1] 0
# If there is no difference in the mean average graded vocabulary's context TFIDF of intermediate LWs and the RC, we would observe a test statistic of 0.0075 or more by chance roughly 0% of the time.
mean(Perm.test.stat_LW_RC_ctxt_TFIDF_B1 >= LW_RC_mean_ctxt_TFIDF_stat_B1)
## [1] 0
# If there is no difference in the mean average graded vocabulary's context TFIDF of advanced LWs and the RC, we would observe a test statistic of 0.0076 or more by chance roughly 0% of the time.
mean(Perm.test.stat_LW_RC_ctxt_TFIDF_B2 >= LW_RC_mean_ctxt_TFIDF_stat_B2)
## [1] 0
# VISUALISATION OF THE MOST RELEVANT DIFFERENCES

# -- Average depth of the graded vocabulary context parse trees --

# Beginner vs. children's literature

# - Graded readers vs. literary works
# - Graded readers vs. reference corpus
# - Literary works vs. reference corpus

GR.LW.RC <- ggplot() +
  geom_histogram(aes(x=readers_corpus$Ctxt_Avg_Depth[
    readers_corpus$Level=="Beginner"], y=after_stat(density)), color = "white", fill = "#D14D72", bins = 100) +
  geom_vline(aes(xintercept = GR_mean_ctxt_depth_A1A2), color = "black", linetype="dotdash", linewidth = 0.5) +
  labs(x ='\nAverage depth of the parse trees of the sentences where the context of the graded vocabulary appears', y='Probability density\n', title = 'Comparison among all three corpora') +
  geom_density(aes(x=readers_corpus$Ctxt_Avg_Depth[
    readers_corpus$Level=="Beginner"]), color = "darkgray", linewidth = 1) +
  annotate(geom="text", x=3.8, y=3.1, label="Beginner graded readers", size=5) +
  geom_histogram(aes(x=literature_corpus$Ctxt_Avg_Depth[
    literature_corpus$Level=="Children's literature"], y=after_stat(density)), color = "white", fill = "#FFABAB", alpha=0.6, bins = 100) +
  geom_vline(aes(xintercept = LW_mean_ctxt_depth_A1A2), color = "black", linetype="dotdash", linewidth = 0.5) +
  annotate(geom="text", x=4.7, y=1, label="Children's literature", size=5) +
  geom_density(aes(x=literature_corpus$Ctxt_Avg_Depth[
    literature_corpus$Level=="Children's literature"]), color = "darkgray", linewidth = 1) +
  geom_histogram(aes(x=reference_corpus$Ctxt_Avg_Depth[
    reference_corpus$group==1], y=after_stat(density)), color = "black", fill = "#FEF2F4", alpha=0.8, bins = 100) +
    geom_vline(aes(xintercept = RC_mean_ctxt_depth), color = "black", linetype="dotdash", linewidth = 0.5) +
    annotate(geom="text", x=7, y=0.6, label="Reference corpus", size=5) +
    geom_density(aes(x=reference_corpus$Ctxt_Avg_Depth[
    reference_corpus$group==1]),color = "darkgray", linewidth = 1) +
  theme(plot.title = element_text(hjust = 0.5, size=18)) +
  theme(axis.title.y = element_text(hjust = 0.5, size=14)) +
  theme(axis.title.x = element_text(hjust = 0.5, size=14))

GR.LW.RC